36429 lines
2.2 MiB
36429 lines
2.2 MiB
/*
	static_mask_21
	--------------
	Four 32-bit elements, each with the low 21 bits set (0x1fffff).
	decodeArray() loads this array into its local __m128i mask_21 with _mm_loadu_si128().
*/
static uint32_t ALIGN_16 static_mask_21[4] =
	{
	0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff
	};
|
|
/*
	static_mask_12
	--------------
	Four 32-bit elements, each with the low 12 bits set (0xfff).
	decodeArray() loads this array into its local __m128i mask_12 with _mm_loadu_si128().
*/
static uint32_t ALIGN_16 static_mask_12[4] =
	{
	0x00000fff, 0x00000fff, 0x00000fff, 0x00000fff
	};
|
|
/*
	static_mask_10
	--------------
	Four 32-bit elements, each with the low 10 bits set (0x3ff).
	decodeArray() loads this array into its local __m128i mask_10 with _mm_loadu_si128().
*/
static uint32_t ALIGN_16 static_mask_10[4] =
	{
	0x000003ff, 0x000003ff, 0x000003ff, 0x000003ff
	};
|
|
/*
	static_mask_9
	-------------
	Four 32-bit elements, each with the low 9 bits set (0x1ff).
	decodeArray() loads this array into its local __m128i mask_9 with _mm_loadu_si128().
*/
static uint32_t ALIGN_16 static_mask_9[4] =
	{
	0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff
	};
|
|
/*
	static_mask_7
	-------------
	Four 32-bit elements, each with the low 7 bits set (0x7f).
	decodeArray() loads this array into its local __m128i mask_7 with _mm_loadu_si128().
*/
static uint32_t ALIGN_16 static_mask_7[4] =
	{
	0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f
	};
|
|
/*
	static_mask_6
	-------------
	Four 32-bit elements, each with the low 6 bits set (0x3f).
	decodeArray() loads this array into its local __m128i mask_6 with _mm_loadu_si128().
*/
static uint32_t ALIGN_16 static_mask_6[4] =
	{
	0x0000003f, 0x0000003f, 0x0000003f, 0x0000003f
	};
|
|
/*
	static_mask_5
	-------------
	Four 32-bit elements, each with the low 5 bits set (0x1f).
	decodeArray() loads this array into its local __m128i mask_5 with _mm_loadu_si128().
*/
static uint32_t ALIGN_16 static_mask_5[4] =
	{
	0x0000001f, 0x0000001f, 0x0000001f, 0x0000001f
	};
|
|
/*
	static_mask_4
	-------------
	Four 32-bit elements, each with the low 4 bits set (0x0f).
	decodeArray() loads this array into its local __m128i mask_4 with _mm_loadu_si128().
*/
static uint32_t ALIGN_16 static_mask_4[4] =
	{
	0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f
	};
|
|
/*
	static_mask_3
	-------------
	Four 32-bit elements, each with the low 3 bits set (0x07).
	decodeArray() loads this array into its local __m128i mask_3 with _mm_loadu_si128().
*/
static uint32_t ALIGN_16 static_mask_3[4] =
	{
	0x00000007, 0x00000007, 0x00000007, 0x00000007
	};
|
|
/*
	static_mask_2
	-------------
	Four 32-bit elements, each with the low 2 bits set (0x03).
	decodeArray() loads this array into its local __m128i mask_2 with _mm_loadu_si128().
*/
static uint32_t ALIGN_16 static_mask_2[4] =
	{
	0x00000003, 0x00000003, 0x00000003, 0x00000003
	};
|
|
/*
	static_mask_1
	-------------
	Four 32-bit elements, each with only the lowest bit set (0x01).
	decodeArray() loads this array into its local __m128i mask_1 with _mm_loadu_si128();
	when NO_ZEROS is defined it is also loaded as the fill value for the 0x00 key case.
*/
static uint32_t ALIGN_16 static_mask_1[4] =
	{
	0x00000001, 0x00000001, 0x00000001, 0x00000001
	};
|
|
void ANT_compress_qmx_v4::decodeArray(const uint32_t *source, uint64_t len, uint32_t *to, uint64_t destination_integers)
|
|
{
|
|
__m128i mask_21, mask_12, mask_10, mask_9, mask_7, mask_6, mask_5, mask_4, mask_3, mask_2, mask_1;
|
|
uint8_t *in = (uint8_t *)source;
|
|
uint8_t *keys = ((uint8_t *)source) + len - 1;
|
|
|
|
mask_21 = _mm_loadu_si128((__m128i *)static_mask_21);
|
|
mask_12 = _mm_loadu_si128((__m128i *)static_mask_12);
|
|
mask_10 = _mm_loadu_si128((__m128i *)static_mask_10);
|
|
mask_9 = _mm_loadu_si128((__m128i *)static_mask_9);
|
|
mask_7 = _mm_loadu_si128((__m128i *)static_mask_7);
|
|
mask_6 = _mm_loadu_si128((__m128i *)static_mask_6);
|
|
mask_5 = _mm_loadu_si128((__m128i *)static_mask_5);
|
|
mask_4 = _mm_loadu_si128((__m128i *)static_mask_4);
|
|
mask_3 = _mm_loadu_si128((__m128i *)static_mask_3);
|
|
mask_2 = _mm_loadu_si128((__m128i *)static_mask_2);
|
|
mask_1 = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
|
|
while (in <= keys) // <= because there can be a boundary case where the final key is 255*0 bit integers
|
|
{
|
|
switch (*keys--)
|
|
{
|
|
case 0x00:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
#ifdef NO_ZEROS
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 64, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 128, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 192, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 256, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 320, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 384, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 448, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 512, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 576, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 640, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 704, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 768, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 832, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 896, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 960, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 960 + 63, tmp);
|
|
|
|
to += 4096;
|
|
break;
|
|
}
|
|
case 0x01:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
#ifdef NO_ZEROS
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 64, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 128, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 192, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 256, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 320, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 384, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 448, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 512, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 576, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 640, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 704, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 768, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 832, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 896, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 896 + 63, tmp);
|
|
|
|
to += 3840;
|
|
break;
|
|
}
|
|
case 0x02:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
#ifdef NO_ZEROS
// NO_ZEROS build: the fill value is loaded from static_mask_1, i.e. each 32-bit lane holds 1.
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
// Default build: the fill value is four zero lanes.
// Previously this XORed `tmp` with itself inside its own initialiser, which reads an
// indeterminate value (undefined behaviour, and an optimiser is free to treat the
// result as undefined rather than zero). _mm_setzero_si128() yields the same all-zero
// vector with defined semantics and typically compiles to the same single pxor.
const __m128i tmp = _mm_setzero_si128();
#endif
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 64, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 128, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 192, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 256, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 320, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 384, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 448, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 512, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 576, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 640, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 704, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 768, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 832, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 832 + 63, tmp);
|
|
|
|
to += 3584;
|
|
break;
|
|
}
|
|
case 0x03:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
/*
	Fill value written by the unrolled stores that follow in this case.
	With NO_ZEROS defined every 32-bit lane is 1 (static_mask_1 holds {1, 1, 1, 1});
	otherwise every lane is 0.
*/
#ifdef NO_ZEROS
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
/*
	_mm_setzero_si128() produces the all-zero vector directly.  XORing tmp with itself
	inside tmp's own initialiser reads an indeterminate value, which is undefined
	behaviour and triggers "used uninitialised" diagnostics.
*/
const __m128i tmp = _mm_setzero_si128();
#endif
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 64, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 128, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 192, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 256, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 320, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 384, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 448, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 512, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 576, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 640, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 704, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 768, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 768 + 63, tmp);
|
|
|
|
to += 3328;
|
|
break;
|
|
}
|
|
case 0x04:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
#ifdef NO_ZEROS
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
/* All-zero vector used by the stores below. Built with _mm_setzero_si128() instead of XOR-ing tmp with itself, which read tmp before it had been initialised. */
const __m128i tmp = _mm_setzero_si128();
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 64, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 128, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 192, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 256, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 320, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 384, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 448, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 512, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 576, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 640, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 704, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 704 + 63, tmp);
|
|
|
|
to += 3072;
|
|
break;
|
|
}
|
|
case 0x05:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
#ifdef NO_ZEROS
/* NO_ZEROS build: the fill value is static_mask_1, i.e. 0x01 in each of the four 32-bit lanes. */
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
/*
	Default build: the fill value is the all-zero vector.
	The previous form XORed tmp with itself inside tmp's own initialiser, which reads an
	uninitialised object (the compiler is not required to produce zero from that).
	_mm_setzero_si128() yields the zero vector without reading anything.
*/
const __m128i tmp = _mm_setzero_si128();
#endif
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 64, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 128, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 192, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 256, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 320, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 384, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 448, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 512, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 576, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 640, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 640 + 63, tmp);
|
|
|
|
to += 2816;
|
|
break;
|
|
}
|
|
case 0x06:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
#ifdef NO_ZEROS
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
// All-zero vector that the stores below copy into the output buffer.
// Built with _mm_setzero_si128() instead of XOR-ing tmp with itself: the
// old form read tmp inside its own initializer, i.e. before it had a value.
const __m128i tmp = _mm_setzero_si128();
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 64, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 128, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 192, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 256, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 320, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 384, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 448, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 512, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 576, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 576 + 63, tmp);
|
|
|
|
to += 2560;
|
|
break;
|
|
}
|
|
case 0x07:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
/*
	Fill value for the run of stores that follows: every store in this case writes the same 128-bit vector.
	With NO_ZEROS defined each 32-bit lane is 1 (static_mask_1), otherwise each lane is 0.
*/
#ifdef NO_ZEROS
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
/*
	Use the dedicated zeroing intrinsic.  XOR-ing tmp with itself inside its own initialiser reads
	an indeterminate value, so the result is not guaranteed to be zero.
*/
const __m128i tmp = _mm_setzero_si128();
#endif
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 64, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 128, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 192, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 256, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 320, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 384, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 448, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 512, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 512 + 63, tmp);
|
|
|
|
to += 2304;
|
|
break;
|
|
}
|
|
case 0x08:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
#ifdef NO_ZEROS
// NO_ZEROS build: the fill value is loaded from static_mask_1 (four 32-bit lanes of 0x01).
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
// Default build: the fill value is an all-zero vector.
// The previous form XOR-ed tmp with itself inside its own initialiser, which reads an
// indeterminate value; _mm_setzero_si128() yields the zero vector without doing so.
const __m128i tmp = _mm_setzero_si128();
#endif
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 64, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 128, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 192, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 256, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 320, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 384, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 448, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 448 + 63, tmp);
|
|
|
|
to += 2048;
|
|
break;
|
|
}
|
|
case 0x09:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
#ifdef NO_ZEROS
/* NO_ZEROS build: the fill value is static_mask_1, i.e. every 32-bit lane holds 1. */
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
/*
	Default build: the fill value is an all-zero vector.
	Use _mm_setzero_si128() rather than XOR-ing tmp with itself inside its own
	initializer: that form reads tmp before it has a value, which is undefined
	behaviour and draws uninitialized-variable warnings.
*/
const __m128i tmp = _mm_setzero_si128();
#endif
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 64, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 128, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 192, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 256, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 320, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 384, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 384 + 63, tmp);
|
|
|
|
to += 1792;
|
|
break;
|
|
}
|
|
case 0x0a:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
#ifdef NO_ZEROS
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 64, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 128, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 192, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 256, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 320, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 320 + 63, tmp);
|
|
|
|
to += 1536;
|
|
break;
|
|
}
|
|
case 0x0b:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
#ifdef NO_ZEROS
/* Fill value is 1 in every 32-bit lane (static_mask_1 holds {1, 1, 1, 1}). */
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
#else
/*
	Fill value is 0 in every lane.  Use the dedicated zeroing intrinsic: the
	previous form XORed `tmp` with itself inside its own initializer, which
	reads an uninitialised object (undefined behaviour, and a -Winit-self /
	-Wuninitialized diagnostic).
*/
const __m128i tmp = _mm_setzero_si128();
#endif
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 64, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 128, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 192, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 256, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 256 + 63, tmp);
|
|
|
|
to += 1280;
|
|
break;
|
|
}
|
|
case 0x0c:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
#ifdef NO_ZEROS
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 64, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 128, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 192, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 192 + 63, tmp);
|
|
|
|
to += 1024;
|
|
break;
|
|
}
|
|
case 0x0d:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
#ifdef NO_ZEROS
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 64, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 128, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 128 + 63, tmp);
|
|
|
|
to += 768;
|
|
break;
|
|
}
|
|
case 0x0e:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
#ifdef NO_ZEROS
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
|
|
|
|
_mm_storeu_si128((__m128i *)to + 64, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp);
|
|
|
|
to += 512;
|
|
break;
|
|
}
|
|
case 0x0f:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
#ifdef NO_ZEROS
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);
|
|
#else
|
|
const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
#endif
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 32, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 34, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 35, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 37, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 38, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 40, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 41, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 42, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 43, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 44, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 45, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 46, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 47, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 48, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 49, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 50, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 51, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 52, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 53, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 54, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 55, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 57, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 58, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 59, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 61, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 62, tmp);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 63, tmp);
|
|
|
|
to += 256;
|
|
break;
|
|
}
|
|
case 0x10:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 352, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 384, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 416, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 448, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 480, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 480 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
to += 2048;
|
|
break;
|
|
}
|
|
case 0x11:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
// Unpack one 16-byte word of 1-bit values into 32 output vectors (128 integers),
// written to vector slots 160..191 of `to` (integers 640..767).
{
|
|
// Unaligned load of the next 16 payload bytes.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
// Step `in` past the word just loaded and issue a non-temporal prefetch hint at the new position.
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
// Slot 160 + k receives bit k of each 32-bit lane. The shift operates on 64-bit halves,
// but for k <= 31 bit 0 of every 32-bit lane after the shift is still bit k of that same
// lane, and mask_1 (0x01 per lane) discards everything else.
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
// Unpack one 16-byte word of 1-bit values into 32 output vectors (128 integers),
// written to vector slots 192..223 of `to` (integers 768..895).
{
|
|
// Unaligned load of the next 16 payload bytes.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
// Step `in` past the word just loaded and issue a non-temporal prefetch hint at the new position.
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
// Slot 192 + k receives bit k of each 32-bit lane. The shift operates on 64-bit halves,
// but for k <= 31 bit 0 of every 32-bit lane after the shift is still bit k of that same
// lane, and mask_1 (0x01 per lane) discards everything else.
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
// Unpack one 16-byte word of 1-bit values into 32 output vectors (128 integers),
// written to vector slots 224..255 of `to` (integers 896..1023).
{
|
|
// Unaligned load of the next 16 payload bytes.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
// Step `in` past the word just loaded and issue a non-temporal prefetch hint at the new position.
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
// Slot 224 + k receives bit k of each 32-bit lane. The shift operates on 64-bit halves,
// but for k <= 31 bit 0 of every 32-bit lane after the shift is still bit k of that same
// lane, and mask_1 (0x01 per lane) discards everything else.
_mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
// Unpack one 16-byte word of 1-bit values into 32 output vectors (128 integers),
// written to vector slots 256..287 of `to` (integers 1024..1151).
{
|
|
// Unaligned load of the next 16 payload bytes.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
// Step `in` past the word just loaded and issue a non-temporal prefetch hint at the new position.
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
// Slot 256 + k receives bit k of each 32-bit lane. The shift operates on 64-bit halves,
// but for k <= 31 bit 0 of every 32-bit lane after the shift is still bit k of that same
// lane, and mask_1 (0x01 per lane) discards everything else.
_mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
// Unpack one 16-byte word of 1-bit values into 32 output vectors (128 integers),
// written to vector slots 288..319 of `to` (integers 1152..1279).
{
|
|
// Unaligned load of the next 16 payload bytes.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
// Step `in` past the word just loaded and issue a non-temporal prefetch hint at the new position.
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
// Slot 288 + k receives bit k of each 32-bit lane. The shift operates on 64-bit halves,
// but for k <= 31 bit 0 of every 32-bit lane after the shift is still bit k of that same
// lane, and mask_1 (0x01 per lane) discards everything else.
_mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
// Unpack one 16-byte word of 1-bit values into 32 output vectors (128 integers),
// written to vector slots 320..351 of `to` (integers 1280..1407).
{
|
|
// Unaligned load of the next 16 payload bytes.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
// Step `in` past the word just loaded and issue a non-temporal prefetch hint at the new position.
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
// Slot 320 + k receives bit k of each 32-bit lane. The shift operates on 64-bit halves,
// but for k <= 31 bit 0 of every 32-bit lane after the shift is still bit k of that same
// lane, and mask_1 (0x01 per lane) discards everything else.
_mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
// Unpack one 16-byte word of 1-bit values into 32 output vectors (128 integers),
// written to vector slots 352..383 of `to` (integers 1408..1535).
{
|
|
// Unaligned load of the next 16 payload bytes.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
// Step `in` past the word just loaded and issue a non-temporal prefetch hint at the new position.
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
// Slot 352 + k receives bit k of each 32-bit lane. The shift operates on 64-bit halves,
// but for k <= 31 bit 0 of every 32-bit lane after the shift is still bit k of that same
// lane, and mask_1 (0x01 per lane) discards everything else.
_mm_storeu_si128((__m128i *)to + 352, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
// Unpack one 16-byte word of 1-bit values into 32 output vectors (128 integers),
// written to vector slots 384..415 of `to` (integers 1536..1663).
{
|
|
// Unaligned load of the next 16 payload bytes.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
// Step `in` past the word just loaded and issue a non-temporal prefetch hint at the new position.
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
// Slot 384 + k receives bit k of each 32-bit lane. The shift operates on 64-bit halves,
// but for k <= 31 bit 0 of every 32-bit lane after the shift is still bit k of that same
// lane, and mask_1 (0x01 per lane) discards everything else.
_mm_storeu_si128((__m128i *)to + 384, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
// Unpack one 16-byte word of 1-bit values into 32 output vectors (128 integers),
// written to vector slots 416..447 of `to` (integers 1664..1791).
{
|
|
// Unaligned load of the next 16 payload bytes.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
// Step `in` past the word just loaded and issue a non-temporal prefetch hint at the new position.
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
// Slot 416 + k receives bit k of each 32-bit lane. The shift operates on 64-bit halves,
// but for k <= 31 bit 0 of every 32-bit lane after the shift is still bit k of that same
// lane, and mask_1 (0x01 per lane) discards everything else.
_mm_storeu_si128((__m128i *)to + 416, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 448, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 448 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
to += 1920;
|
|
break;
|
|
}
|
|
case 0x12:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 352, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 384, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 416, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 416 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
to += 1792;
|
|
break;
|
|
}
|
|
case 0x13:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 352, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 384, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 384 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
to += 1664;
|
|
break;
|
|
}
|
|
case 0x14:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 352, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 352 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
to += 1536;
|
|
break;
|
|
}
|
|
case 0x15:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
to += 1408;
|
|
break;
|
|
}
|
|
case 0x16:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
to += 1280;
|
|
break;
|
|
}
|
|
case 0x17:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
to += 1152;
|
|
break;
|
|
}
|
|
case 0x18:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
to += 1024;
|
|
break;
|
|
}
|
|
case 0x19:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
to += 896;
|
|
break;
|
|
}
|
|
case 0x1a:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
to += 768;
|
|
break;
|
|
}
|
|
// Selector 0x1b: unpack five consecutive 16-byte input words of 1-bit values.
// Each word yields 32 stores of four 32-bit lanes, so the case emits
// 5 * 32 * 4 = 640 integers, which matches the `to += 640` at the end.
case 0x1b:
|
|
// `keys` was already decremented by the switch expression, so this hints the next selector byte.
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
// Word 1 of 5: written to __m128i slots 0..31 of `to`.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
// Advance `in` past the 16 bytes just loaded and hint the following input.
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
// Store i keeps bit i of every 32-bit lane. The shift works on 64-bit lanes, so bits
// cross between the two 32-bit halves, but mask_1 keeps only bit 0 of each 32-bit lane,
// and for shifts 0..31 that bit is still bit i of the same lane.
// The cast binds before the addition, so offsets count 16-byte slots (4 integers each).
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
// Word 2 of 5: same unpacking, written to __m128i slots 32..63.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
// Word 3 of 5: same unpacking, written to __m128i slots 64..95.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
// Word 4 of 5: same unpacking, written to __m128i slots 96..127.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
// Word 5 of 5: same unpacking, written to __m128i slots 128..159.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
// 160 slots of 4 integers each were written above; step the output cursor past them.
to += 640;
|
|
break;
|
|
}
|
|
case 0x1c:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
to += 512;
|
|
break;
|
|
}
|
|
// Selector 0x1d: unpack three consecutive 16-byte loads of one-bit values.
// Each load yields 32 stores of 4 x uint32_t, so this arm writes 3 * 128 = 384 integers.
case 0x1d:
|
|
// The switch already post-decremented keys, so this prefetches the next selector byte.
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
// Load 1 of 3: store k (k = 0..31) writes bit k of each 32-bit lane to to[4k .. 4k+3].
// The 64-bit shift is equivalent to a per-lane shift here: k <= 31 and mask_1 (0x01 per
// 32-bit lane) keeps only bit 0, so bits carried across the lane boundary are discarded.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
// Advance the input pointer by the 16 bytes just consumed and prefetch what follows.
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
// Load 2 of 3: same unpacking, written 32 vectors (128 integers) further into the output.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
// Load 3 of 3: same unpacking, written 64 vectors (256 integers) into the output.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
// 3 loads * 32 stores * 4 lanes = 384 integers written; advance the output cursor past them.
to += 384;
|
|
break;
|
|
}
|
|
// Selector 0x1e: unpack two consecutive 16-byte loads of one-bit values.
// Each load yields 32 stores of 4 x uint32_t, so this arm writes 2 * 128 = 256 integers.
case 0x1e:
|
|
// The switch already post-decremented keys, so this prefetches the next selector byte.
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
// Load 1 of 2: store k (k = 0..31) writes bit k of each 32-bit lane to to[4k .. 4k+3].
// The 64-bit shift is equivalent to a per-lane shift here: k <= 31 and mask_1 (0x01 per
// 32-bit lane) keeps only bit 0, so bits carried across the lane boundary are discarded.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
// Advance the input pointer by the 16 bytes just consumed and prefetch what follows.
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
// Load 2 of 2: same unpacking, written 32 vectors (128 integers) further into the output.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
// 2 loads * 32 stores * 4 lanes = 256 integers written; advance the output cursor past them.
to += 256;
|
|
break;
|
|
}
|
|
// Selector 0x1f: unpack a single 16-byte load of one-bit values.
// The load yields 32 stores of 4 x uint32_t, so this arm writes 128 integers.
case 0x1f:
|
|
// The switch already post-decremented keys, so this prefetches the next selector byte.
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
// Store k (k = 0..31) writes bit k of each 32-bit lane to to[4k .. 4k+3].
// The 64-bit shift is equivalent to a per-lane shift here: k <= 31 and mask_1 (0x01 per
// 32-bit lane) keeps only bit 0, so bits carried across the lane boundary are discarded.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
// Advance the input pointer by the 16 bytes just consumed and prefetch what follows.
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));
|
|
}
|
|
|
|
// 1 load * 32 stores * 4 lanes = 128 integers written; advance the output cursor past them.
to += 128;
|
|
break;
|
|
}
|
|
case 0x20:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 176, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 208, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 240, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 240 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 240 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 240 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 240 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 240 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 240 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 240 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 240 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 240 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 240 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 240 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 240 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 240 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 240 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 240 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
to += 1024;
|
|
break;
|
|
}
|
|
case 0x21:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 176, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 208, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
to += 960;
|
|
break;
|
|
}
|
|
case 0x22:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 176, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 208, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 208 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
to += 896;
|
|
break;
|
|
}
|
|
case 0x23:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 176, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
to += 832;
|
|
break;
|
|
}
|
|
case 0x24:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 176, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 176 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
to += 768;
|
|
break;
|
|
}
|
|
case 0x25:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
to += 704;
|
|
break;
|
|
}
|
|
case 0x26:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
to += 640;
|
|
break;
|
|
}
|
|
case 0x27:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
to += 576;
|
|
break;
|
|
}
|
|
case 0x28:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
to += 512;
|
|
break;
|
|
}
|
|
case 0x29:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
to += 448;
|
|
break;
|
|
}
|
|
case 0x2a:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
to += 384;
|
|
break;
|
|
}
|
|
case 0x2b:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
to += 320;
|
|
break;
|
|
}
|
|
case 0x2c:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
to += 256;
|
|
break;
|
|
}
|
|
case 0x2d:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
to += 192;
|
|
break;
|
|
}
|
|
case 0x2e:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
to += 128;
|
|
break;
|
|
}
|
|
case 0x2f:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));
|
|
}
|
|
|
|
to += 64;
|
|
break;
|
|
}
|
|
case 0x30:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 110, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 120, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 130, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 140, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 140 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 140 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 140 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 140 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 140 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 140 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 140 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 140 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 140 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 150, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 150 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 150 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 150 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 150 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 150 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 150 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 150 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 150 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 150 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
to += 640;
|
|
break;
|
|
}
|
|
case 0x31:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 110, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 120, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 130, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 140, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 140 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 140 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 140 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 140 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 140 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 140 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 140 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 140 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 140 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
to += 600;
|
|
break;
|
|
}
|
|
case 0x32:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 110, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 120, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 130, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 130 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
to += 560;
|
|
break;
|
|
}
|
|
case 0x33:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 110, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 120, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
to += 520;
|
|
break;
|
|
}
|
|
// Key 0x34: unpack twelve 16-byte groups of 3-bit packed integers.
// Each 16-byte load from `in` is expanded into ten 128-bit stores; store k
// holds (group >> 3*k) & mask_3 in every 32-bit element, i.e. 4 integers per
// store, 40 per load and 12 * 40 = 480 integers overall (matching `to += 480`).
// NOTE(review): assumed to be a label of the `switch (*keys--)` at the top of
// decodeArray -- confirm, the switch header is outside this excerpt.
case 0x34:
|
|
// Prefetch at the current `keys` pointer with a non-temporal hint.
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
// Unaligned 16-byte load, then advance `in` past it and prefetch what follows.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
// `(__m128i *)to + k` addresses the k-th 16-byte slot, i.e. uint32_t index 4*k.
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
|
|
// The shift works on 64-bit lanes, but the largest shift is 27 and only the low
// 3 bits of each 32-bit element survive the mask, so bits carried in from the
// neighbouring element never reach the stored value.
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
// Groups 2..12 repeat the same pattern with the destination slot base raised by 10 each time.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 110, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 110 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
// Advance the output cursor: 12 loads * 10 stores * 4 uint32_t per store = 480.
to += 480;
|
|
break;
|
|
}
|
|
case 0x35:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
to += 440;
|
|
break;
|
|
}
|
|
case 0x36:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
to += 400;
|
|
break;
|
|
}
|
|
case 0x37:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
to += 360;
|
|
break;
|
|
}
|
|
case 0x38:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
to += 320;
|
|
break;
|
|
}
|
|
case 0x39:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
to += 280;
|
|
break;
|
|
}
|
|
case 0x3a:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
to += 240;
|
|
break;
|
|
}
|
|
case 0x3b:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
to += 200;
|
|
break;
|
|
}
|
|
case 0x3c:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
to += 160;
|
|
break;
|
|
}
|
|
case 0x3d:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
to += 120;
|
|
break;
|
|
}
|
|
// Key byte 0x3e: unpack two 16-byte words of packed 3-bit integers.
// Each word is handled as four 32-bit lanes carrying ten 3-bit fields apiece
// (bits 0..29; the top two bits of every lane are never read), so this arm
// consumes 32 bytes from `in` and writes 2 * 10 * 4 = 80 integers to `to`.
// Output is lane-interleaved: the k-th store of a word writes field k of
// lanes 0..3 into four consecutive uint32_t slots.
// NOTE(review): all 80 integers are stored unconditionally; nothing here checks
// the remaining output space -- presumably the caller over-allocates `to`; confirm.
case 0x3e:
|
|
// `keys` was already post-decremented by the switch expression, so this hints the next key byte.
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
// Word 1 of 2 -> vectors 0..9.
{
|
|
// Unaligned 16-byte load of the next packed word.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
// More than a hint: the `in += 16` side effect is what steps the input pointer past the word just loaded.
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
// Field 0 needs no shift; mask_3 keeps only the low 3 bits of each 32-bit lane.
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
|
|
// The shift operates on 64-bit elements, so bits from the upper 32-bit half slide into the
// top of the lower half; the mask discards them because shift + 3 never exceeds 32.
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
// Word 2 of 2 -> vectors 10..19.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
// The cast binds tighter than `+`, so the offset counts 16-byte vectors: this lands 40 integers past `to`.
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
// `to` is a uint32_t pointer: 20 vectors * 4 lanes = 80 integers were written above.
to += 80;
|
|
break;
|
|
}
|
|
// Key byte 0x3f: unpack a single 16-byte word of packed 3-bit integers.
// The word is handled as four 32-bit lanes carrying ten 3-bit fields apiece
// (bits 0..29; the top two bits of every lane are never read), so this arm
// consumes 16 bytes from `in` and writes 10 * 4 = 40 integers to `to`.
// Output is lane-interleaved: the k-th store writes field k of lanes 0..3
// into four consecutive uint32_t slots.
// NOTE(review): all 40 integers are stored unconditionally; nothing here checks
// the remaining output space -- presumably the caller over-allocates `to`; confirm.
case 0x3f:
|
|
// `keys` was already post-decremented by the switch expression, so this hints the next key byte.
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
// Unaligned 16-byte load of the packed word.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
// More than a hint: the `in += 16` side effect is what steps the input pointer past the word just loaded.
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
// Field 0 needs no shift; mask_3 keeps only the low 3 bits of each 32-bit lane.
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3));
|
|
// The shift operates on 64-bit elements, so bits from the upper 32-bit half slide into the
// top of the lower half; the mask discards them because shift + 3 never exceeds 32.
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));
|
|
}
|
|
|
|
// `to` is a uint32_t pointer: 10 vectors * 4 lanes = 40 integers were written above.
to += 40;
|
|
break;
|
|
}
|
|
// Key byte 0x40: unpack sixteen 16-byte words of packed 4-bit integers.
// Each word is handled as four 32-bit lanes carrying eight 4-bit fields apiece
// (all 32 bits of every lane are used), so this arm consumes 256 bytes from `in`
// and writes 16 * 8 * 4 = 512 integers to `to`.
// Output is lane-interleaved: the k-th store of a word writes field k of
// lanes 0..3 into four consecutive uint32_t slots.
// NOTE(review): all 512 integers are stored unconditionally; nothing here checks
// the remaining output space -- presumably the caller over-allocates `to`; confirm.
case 0x40:
|
|
// `keys` was already post-decremented by the switch expression, so this hints the next key byte.
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
// Word 1 of 16 -> vectors 0..7.
{
|
|
// Unaligned 16-byte load of the next packed word.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
// More than a hint: the `in += 16` side effect is what steps the input pointer past the word just loaded.
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
// Field 0 needs no shift; mask_4 keeps only the low 4 bits of each 32-bit lane.
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4));
|
|
// The shift operates on 64-bit elements, so bits from the upper 32-bit half slide into the
// top of the lower half; the mask discards them because shift + 4 never exceeds 32.
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 2 of 16 -> vectors 8..15.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
// The cast binds tighter than `+`, so the offset counts 16-byte vectors: this lands 32 integers past `to`.
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 3 of 16 -> vectors 16..23.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 4 of 16 -> vectors 24..31.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 5 of 16 -> vectors 32..39.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 6 of 16 -> vectors 40..47.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 7 of 16 -> vectors 48..55.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 8 of 16 -> vectors 56..63.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 9 of 16 -> vectors 64..71.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 10 of 16 -> vectors 72..79.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 11 of 16 -> vectors 80..87.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 12 of 16 -> vectors 88..95.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 88, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 13 of 16 -> vectors 96..103.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 14 of 16 -> vectors 104..111.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 104, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 104 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 104 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 104 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 104 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 104 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 104 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 104 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 15 of 16 -> vectors 112..119.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 16 of 16 -> vectors 120..127.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 120, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 120 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// `to` is a uint32_t pointer: 128 vectors * 4 lanes = 512 integers were written above.
to += 512;
|
|
break;
|
|
}
|
|
// Key byte 0x41: unpack fifteen 16-byte words of packed 4-bit integers.
// Each word is handled as four 32-bit lanes carrying eight 4-bit fields apiece
// (all 32 bits of every lane are used), so this arm consumes 240 bytes from `in`
// and writes 15 * 8 * 4 = 480 integers to `to`.
// Output is lane-interleaved: the k-th store of a word writes field k of
// lanes 0..3 into four consecutive uint32_t slots.
// NOTE(review): all 480 integers are stored unconditionally; nothing here checks
// the remaining output space -- presumably the caller over-allocates `to`; confirm.
case 0x41:
|
|
// `keys` was already post-decremented by the switch expression, so this hints the next key byte.
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
// Word 1 of 15 -> vectors 0..7.
{
|
|
// Unaligned 16-byte load of the next packed word.
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
// More than a hint: the `in += 16` side effect is what steps the input pointer past the word just loaded.
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
// Field 0 needs no shift; mask_4 keeps only the low 4 bits of each 32-bit lane.
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4));
|
|
// The shift operates on 64-bit elements, so bits from the upper 32-bit half slide into the
// top of the lower half; the mask discards them because shift + 4 never exceeds 32.
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 2 of 15 -> vectors 8..15.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
// The cast binds tighter than `+`, so the offset counts 16-byte vectors: this lands 32 integers past `to`.
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 3 of 15 -> vectors 16..23.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 4 of 15 -> vectors 24..31.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 5 of 15 -> vectors 32..39.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 6 of 15 -> vectors 40..47.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 7 of 15 -> vectors 48..55.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 8 of 15 -> vectors 56..63.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 9 of 15 -> vectors 64..71.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 10 of 15 -> vectors 72..79.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 11 of 15 -> vectors 80..87.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 12 of 15 -> vectors 88..95.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 88, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 13 of 15 -> vectors 96..103.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 14 of 15 -> vectors 104..111.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 104, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 104 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 104 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 104 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 104 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 104 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 104 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 104 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// Word 15 of 15 -> vectors 112..119.
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
// `to` is a uint32_t pointer: 120 vectors * 4 lanes = 480 integers were written above.
to += 480;
|
|
break;
|
|
}
|
|
case 0x42:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 88, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 104, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 104 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 104 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 104 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 104 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 104 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 104 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 104 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
to += 448;
|
|
break;
|
|
}
|
|
case 0x43:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 88, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
to += 416;
|
|
break;
|
|
}
|
|
case 0x44:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 88, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 88 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
to += 384;
|
|
break;
|
|
}
|
|
case 0x45:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
to += 352;
|
|
break;
|
|
}
|
|
case 0x46:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
to += 320;
|
|
break;
|
|
}
|
|
case 0x47:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
to += 288;
|
|
break;
|
|
}
|
|
case 0x48:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
to += 256;
|
|
break;
|
|
}
|
|
case 0x49:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
to += 224;
|
|
break;
|
|
}
|
|
case 0x4a:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
to += 192;
|
|
break;
|
|
}
|
|
case 0x4b:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
to += 160;
|
|
break;
|
|
}
|
|
case 0x4c:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
to += 128;
|
|
break;
|
|
}
|
|
case 0x4d:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
to += 96;
|
|
break;
|
|
}
|
|
case 0x4e:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
to += 64;
|
|
break;
|
|
}
|
|
case 0x4f:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));
|
|
}
|
|
|
|
to += 32;
|
|
break;
|
|
}
|
|
case 0x50:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 66, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 78, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 78 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 78 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 78 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 78 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 78 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
to += 384;
|
|
break;
|
|
}
|
|
case 0x51:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 66, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 78, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 78 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 78 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 78 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 78 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 78 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
to += 360;
|
|
break;
|
|
}
|
|
case 0x52:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 66, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 78, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 78 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 78 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 78 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 78 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 78 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
to += 336;
|
|
break;
|
|
}
|
|
case 0x53:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 66, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
to += 312;
|
|
break;
|
|
}
|
|
case 0x54:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 66, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 66 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
to += 288;
|
|
break;
|
|
}
|
|
case 0x55:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
to += 264;
|
|
break;
|
|
}
|
|
case 0x56:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
to += 240;
|
|
break;
|
|
}
|
|
case 0x57:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
to += 216;
|
|
break;
|
|
}
|
|
case 0x58:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
to += 192;
|
|
break;
|
|
}
|
|
case 0x59:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
to += 168;
|
|
break;
|
|
}
|
|
case 0x5a:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
to += 144;
|
|
break;
|
|
}
|
|
case 0x5b:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
to += 120;
|
|
break;
|
|
}
|
|
case 0x5c:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
to += 96;
|
|
break;
|
|
}
|
|
case 0x5d:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
to += 72;
|
|
break;
|
|
}
|
|
case 0x5e:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
to += 48;
|
|
break;
|
|
}
|
|
case 0x5f:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));
|
|
}
|
|
|
|
to += 24;
|
|
break;
|
|
}
|
|
case 0x60:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 75, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 75 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 75 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 75 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 75 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
to += 320;
|
|
break;
|
|
}
|
|
case 0x61:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
to += 300;
|
|
break;
|
|
}
|
|
case 0x62:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
to += 280;
|
|
break;
|
|
}
|
|
case 0x63:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
to += 260;
|
|
break;
|
|
}
|
|
case 0x64:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
to += 240;
|
|
break;
|
|
}
|
|
case 0x65:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
to += 220;
|
|
break;
|
|
}
|
|
case 0x66:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
to += 200;
|
|
break;
|
|
}
|
|
case 0x67:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
to += 180;
|
|
break;
|
|
}
|
|
case 0x68:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
to += 160;
|
|
break;
|
|
}
|
|
case 0x69:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
to += 140;
|
|
break;
|
|
}
|
|
case 0x6a:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
to += 120;
|
|
break;
|
|
}
|
|
case 0x6b:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
to += 100;
|
|
break;
|
|
}
|
|
case 0x6c:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
to += 80;
|
|
break;
|
|
}
|
|
case 0x6d:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
to += 60;
|
|
break;
|
|
}
|
|
case 0x6e:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
to += 40;
|
|
break;
|
|
}
|
|
case 0x6f:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));
|
|
}
|
|
|
|
to += 20;
|
|
break;
|
|
}
|
|
case 0x70:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 99, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 99 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 108, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 108 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 117, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 117 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 117 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 117 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 117 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 117 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 117 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 117 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 117 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 126, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 126 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 126 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 126 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 126 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 126 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 126 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 126 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 126 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 135, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 135 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 135 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 135 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 135 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 135 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 135 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 135 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 135 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
to += 576;
|
|
break;
|
|
}
|
|
case 0x71:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 99, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 99 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 108, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 108 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 117, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 117 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 117 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 117 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 117 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 117 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 117 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 117 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 117 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 126, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 126 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 126 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 126 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 126 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 126 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 126 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 126 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 126 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
to += 540;
|
|
break;
|
|
}
|
|
case 0x72:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 99, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 99 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 108, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 108 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 117, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 117 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 117 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 117 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 117 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 117 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 117 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 117 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 117 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
to += 504;
|
|
break;
|
|
}
|
|
case 0x73:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 99, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 99 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 108, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 108 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 108 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
to += 468;
|
|
break;
|
|
}
|
|
case 0x74:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 99, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 99 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 99 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
to += 432;
|
|
break;
|
|
}
|
|
case 0x75:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
to += 396;
|
|
break;
|
|
}
|
|
case 0x76:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
to += 360;
|
|
break;
|
|
}
|
|
case 0x77:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
to += 324;
|
|
break;
|
|
}
|
|
case 0x78:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
to += 288;
|
|
break;
|
|
}
|
|
case 0x79:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
to += 252;
|
|
break;
|
|
}
|
|
case 0x7a:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
to += 216;
|
|
break;
|
|
}
|
|
case 0x7b:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
to += 180;
|
|
break;
|
|
}
|
|
case 0x7c:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
to += 144;
|
|
break;
|
|
}
|
|
case 0x7d:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
to += 108;
|
|
break;
|
|
}
|
|
case 0x7e:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
to += 72;
|
|
break;
|
|
}
|
|
case 0x7f:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));
|
|
}
|
|
|
|
to += 36;
|
|
break;
|
|
}
|
|
case 0x80:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 44, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 44 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 44 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 44 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 52, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 52 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 52 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 52 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
to += 256;
|
|
break;
|
|
}
|
|
case 0x81:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 44, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 44 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 44 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 44 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 52, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 52 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 52 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 52 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
to += 240;
|
|
break;
|
|
}
|
|
case 0x82:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 44, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 44 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 44 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 44 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 52, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 52 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 52 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 52 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
to += 224;
|
|
break;
|
|
}
|
|
case 0x83:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 44, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 44 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 44 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 44 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 48, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 48 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
to += 208;
|
|
break;
|
|
}
|
|
case 0x84:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 44, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 44 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 44 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 44 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
to += 192;
|
|
break;
|
|
}
|
|
case 0x85:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
to += 176;
|
|
break;
|
|
}
|
|
case 0x86:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
to += 160;
|
|
break;
|
|
}
|
|
case 0x87:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
to += 144;
|
|
break;
|
|
}
|
|
case 0x88:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
to += 128;
|
|
break;
|
|
}
|
|
case 0x89:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
to += 112;
|
|
break;
|
|
}
|
|
case 0x8a:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
to += 96;
|
|
break;
|
|
}
|
|
case 0x8b:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
to += 80;
|
|
break;
|
|
}
|
|
case 0x8c:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
to += 64;
|
|
break;
|
|
}
|
|
case 0x8d:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
to += 48;
|
|
break;
|
|
}
|
|
case 0x8e:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
to += 32;
|
|
break;
|
|
}
|
|
case 0x8f:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));
|
|
const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));
|
|
}
|
|
|
|
to += 16;
|
|
break;
|
|
}
|
|
case 0x90:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 77, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 77 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 91, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 91 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 91 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 91 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 91 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 91 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 91 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 98, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 98 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 98 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 98 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 98 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 98 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 98 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 105, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 105 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 105 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 105 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 105 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 105 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 105 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
to += 448;
|
|
break;
|
|
}
|
|
case 0x91:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 77, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 77 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 91, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 91 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 91 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 91 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 91 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 91 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 91 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 98, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 98 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 98 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 98 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 98 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 98 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 98 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
to += 420;
|
|
break;
|
|
}
|
|
case 0x92:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 77, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 77 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 91, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 91 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 91 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 91 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 91 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 91 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 91 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
to += 392;
|
|
break;
|
|
}
|
|
case 0x93:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 77, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 77 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 84 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
to += 364;
|
|
break;
|
|
}
|
|
case 0x94:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 77, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 77 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 77 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
to += 336;
|
|
break;
|
|
}
|
|
case 0x95:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
to += 308;
|
|
break;
|
|
}
|
|
case 0x96:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
to += 280;
|
|
break;
|
|
}
|
|
case 0x97:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
to += 252;
|
|
break;
|
|
}
|
|
case 0x98:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
to += 224;
|
|
break;
|
|
}
|
|
case 0x99:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
to += 196;
|
|
break;
|
|
}
|
|
case 0x9a:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
to += 168;
|
|
break;
|
|
}
|
|
case 0x9b:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
to += 140;
|
|
break;
|
|
}
|
|
case 0x9c:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
to += 112;
|
|
break;
|
|
}
|
|
case 0x9d:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
to += 84;
|
|
break;
|
|
}
|
|
case 0x9e:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
to += 56;
|
|
break;
|
|
}
|
|
case 0x9f:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));
|
|
}
|
|
|
|
to += 28;
|
|
break;
|
|
}
|
|
case 0xa0:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
to += 192;
|
|
break;
|
|
}
|
|
case 0xa1:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
to += 180;
|
|
break;
|
|
}
|
|
case 0xa2:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
to += 168;
|
|
break;
|
|
}
|
|
case 0xa3:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
to += 156;
|
|
break;
|
|
}
|
|
case 0xa4:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
to += 144;
|
|
break;
|
|
}
|
|
case 0xa5:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
to += 132;
|
|
break;
|
|
}
|
|
case 0xa6:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
to += 120;
|
|
break;
|
|
}
|
|
case 0xa7:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
to += 108;
|
|
break;
|
|
}
|
|
case 0xa8:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
to += 96;
|
|
break;
|
|
}
|
|
case 0xa9:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
to += 84;
|
|
break;
|
|
}
|
|
case 0xaa:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
to += 72;
|
|
break;
|
|
}
|
|
case 0xab:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
to += 60;
|
|
break;
|
|
}
|
|
case 0xac:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
to += 48;
|
|
break;
|
|
}
|
|
case 0xad:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
to += 36;
|
|
break;
|
|
}
|
|
case 0xae:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
to += 24;
|
|
break;
|
|
}
|
|
case 0xaf:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));
|
|
}
|
|
|
|
to += 12;
|
|
break;
|
|
}
|
|
case 0xb0:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 75, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 75 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 75 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 75 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 75 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
to += 320;
|
|
break;
|
|
}
|
|
case 0xb1:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
to += 300;
|
|
break;
|
|
}
|
|
case 0xb2:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
to += 280;
|
|
break;
|
|
}
|
|
case 0xb3:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
to += 260;
|
|
break;
|
|
}
|
|
case 0xb4:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
to += 240;
|
|
break;
|
|
}
|
|
case 0xb5:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
to += 220;
|
|
break;
|
|
}
|
|
case 0xb6:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
to += 200;
|
|
break;
|
|
}
|
|
case 0xb7:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
to += 180;
|
|
break;
|
|
}
|
|
case 0xb8:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
to += 160;
|
|
break;
|
|
}
|
|
case 0xb9:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
to += 140;
|
|
break;
|
|
}
|
|
case 0xba:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
to += 120;
|
|
break;
|
|
}
|
|
case 0xbb:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
to += 100;
|
|
break;
|
|
}
|
|
case 0xbc:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
to += 80;
|
|
break;
|
|
}
|
|
case 0xbd:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
to += 60;
|
|
break;
|
|
}
|
|
case 0xbe:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
to += 40;
|
|
break;
|
|
}
|
|
case 0xbf:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));
|
|
}
|
|
|
|
to += 20;
|
|
break;
|
|
}
|
|
case 0xc0:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 22, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 22 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 26, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 26 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
to += 128;
|
|
break;
|
|
}
|
|
case 0xc1:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 22, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 22 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 26, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 26 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
to += 120;
|
|
break;
|
|
}
|
|
case 0xc2:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 22, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 22 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 26, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 26 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
to += 112;
|
|
break;
|
|
}
|
|
case 0xc3:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 22, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 22 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
to += 104;
|
|
break;
|
|
}
|
|
case 0xc4:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 22, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 22 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
to += 96;
|
|
break;
|
|
}
|
|
case 0xc5:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
to += 88;
|
|
break;
|
|
}
|
|
case 0xc6:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
to += 80;
|
|
break;
|
|
}
|
|
case 0xc7:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
to += 72;
|
|
break;
|
|
}
|
|
case 0xc8:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
to += 64;
|
|
break;
|
|
}
|
|
case 0xc9:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
to += 56;
|
|
break;
|
|
}
|
|
case 0xca:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
to += 48;
|
|
break;
|
|
}
|
|
case 0xcb:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
to += 40;
|
|
break;
|
|
}
|
|
case 0xcc:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
to += 32;
|
|
break;
|
|
}
|
|
case 0xcd:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
to += 24;
|
|
break;
|
|
}
|
|
case 0xce:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
to += 16;
|
|
break;
|
|
}
|
|
case 0xcf:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));
|
|
}
|
|
|
|
to += 8;
|
|
break;
|
|
}
|
|
case 0xd0:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
to += 192;
|
|
break;
|
|
}
|
|
case 0xd1:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
to += 180;
|
|
break;
|
|
}
|
|
case 0xd2:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
to += 168;
|
|
break;
|
|
}
|
|
case 0xd3:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
to += 156;
|
|
break;
|
|
}
|
|
case 0xd4:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
to += 144;
|
|
break;
|
|
}
|
|
case 0xd5:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
to += 132;
|
|
break;
|
|
}
|
|
case 0xd6:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
to += 120;
|
|
break;
|
|
}
|
|
case 0xd7:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
to += 108;
|
|
break;
|
|
}
|
|
case 0xd8:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
to += 96;
|
|
break;
|
|
}
|
|
case 0xd9:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
to += 84;
|
|
break;
|
|
}
|
|
case 0xda:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
to += 72;
|
|
break;
|
|
}
|
|
case 0xdb:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
to += 60;
|
|
break;
|
|
}
|
|
case 0xdc:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
to += 48;
|
|
break;
|
|
}
|
|
case 0xdd:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
to += 36;
|
|
break;
|
|
}
|
|
case 0xde:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
to += 24;
|
|
break;
|
|
}
|
|
case 0xdf:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21));
|
|
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));
|
|
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));
|
|
}
|
|
|
|
to += 12;
|
|
break;
|
|
}
|
|
case 0xe0:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 11, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 13, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 15, tmp);
|
|
}
|
|
|
|
to += 64;
|
|
break;
|
|
}
|
|
case 0xe1:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 11, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 13, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 14, tmp);
|
|
}
|
|
|
|
to += 60;
|
|
break;
|
|
}
|
|
case 0xe2:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 11, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 13, tmp);
|
|
}
|
|
|
|
to += 56;
|
|
break;
|
|
}
|
|
case 0xe3:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 11, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 12, tmp);
|
|
}
|
|
|
|
to += 52;
|
|
break;
|
|
}
|
|
case 0xe4:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 11, tmp);
|
|
}
|
|
|
|
to += 48;
|
|
break;
|
|
}
|
|
case 0xe5:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 10, tmp);
|
|
}
|
|
|
|
to += 44;
|
|
break;
|
|
}
|
|
case 0xe6:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 9, tmp);
|
|
}
|
|
|
|
to += 40;
|
|
break;
|
|
}
|
|
case 0xe7:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 8, tmp);
|
|
}
|
|
|
|
to += 36;
|
|
break;
|
|
}
|
|
case 0xe8:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 7, tmp);
|
|
}
|
|
|
|
to += 32;
|
|
break;
|
|
}
|
|
case 0xe9:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 6, tmp);
|
|
}
|
|
|
|
to += 28;
|
|
break;
|
|
}
|
|
case 0xea:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 5, tmp);
|
|
}
|
|
|
|
to += 24;
|
|
break;
|
|
}
|
|
case 0xeb:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 4, tmp);
|
|
}
|
|
|
|
to += 20;
|
|
break;
|
|
}
|
|
case 0xec:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 3, tmp);
|
|
}
|
|
|
|
to += 16;
|
|
break;
|
|
}
|
|
case 0xed:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 2, tmp);
|
|
}
|
|
|
|
to += 12;
|
|
break;
|
|
}
|
|
case 0xee:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
}
|
|
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 1, tmp);
|
|
}
|
|
|
|
to += 8;
|
|
break;
|
|
}
|
|
case 0xef:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((__m128i *)in);
|
|
_mm_prefetch(in += 16, _MM_HINT_NTA);
|
|
_mm_storeu_si128((__m128i *)to + 0, tmp);
|
|
}
|
|
|
|
to += 4;
|
|
break;
|
|
}
|
|
case 0xf0:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
_mm_prefetch(in + 1, _MM_HINT_NTA);
|
|
*(to + 0) = *(uint8_t *)(in + 0);
|
|
_mm_prefetch(in + 2, _MM_HINT_NTA);
|
|
*(to + 1) = *(uint8_t *)(in + 1);
|
|
_mm_prefetch(in + 3, _MM_HINT_NTA);
|
|
*(to + 2) = *(uint8_t *)(in + 2);
|
|
_mm_prefetch(in + 4, _MM_HINT_NTA);
|
|
*(to + 3) = *(uint8_t *)(in + 3);
|
|
in += 4;
|
|
to += 4;
|
|
break;
|
|
}
|
|
case 0xf1:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
_mm_prefetch(in + 1, _MM_HINT_NTA);
|
|
*(to + 0) = *(uint8_t *)(in + 0);
|
|
_mm_prefetch(in + 2, _MM_HINT_NTA);
|
|
*(to + 1) = *(uint8_t *)(in + 1);
|
|
_mm_prefetch(in + 3, _MM_HINT_NTA);
|
|
*(to + 2) = *(uint8_t *)(in + 2);
|
|
in += 3;
|
|
to += 3;
|
|
break;
|
|
}
|
|
case 0xf2:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
_mm_prefetch(in + 1, _MM_HINT_NTA);
|
|
*(to + 0) = *(uint8_t *)(in + 0);
|
|
_mm_prefetch(in + 2, _MM_HINT_NTA);
|
|
*(to + 1) = *(uint8_t *)(in + 1);
|
|
in += 2;
|
|
to += 2;
|
|
break;
|
|
}
|
|
case 0xf3:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
_mm_prefetch(in + 1, _MM_HINT_NTA);
|
|
*(to + 0) = *(uint8_t *)(in + 0);
|
|
in += 1;
|
|
to += 1;
|
|
break;
|
|
}
|
|
case 0xf4:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
_mm_prefetch(in + 2 * 1, _MM_HINT_NTA);
|
|
*(to + 0) = *(uint16_t *)(in + 2 * 0);
|
|
_mm_prefetch(in + 2 * 2, _MM_HINT_NTA);
|
|
*(to + 1) = *(uint16_t *)(in + 2 * 1);
|
|
_mm_prefetch(in + 2 * 3, _MM_HINT_NTA);
|
|
*(to + 2) = *(uint16_t *)(in + 2 * 2);
|
|
_mm_prefetch(in + 2 * 4, _MM_HINT_NTA);
|
|
*(to + 3) = *(uint16_t *)(in + 2 * 3);
|
|
in += 2 * 4;
|
|
to += 4;
|
|
break;
|
|
}
|
|
case 0xf5:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
_mm_prefetch(in + 2 * 1, _MM_HINT_NTA);
|
|
*(to + 0) = *(uint16_t *)(in + 2 * 0);
|
|
_mm_prefetch(in + 2 * 2, _MM_HINT_NTA);
|
|
*(to + 1) = *(uint16_t *)(in + 2 * 1);
|
|
_mm_prefetch(in + 2 * 3, _MM_HINT_NTA);
|
|
*(to + 2) = *(uint16_t *)(in + 2 * 2);
|
|
in += 2 * 3;
|
|
to += 3;
|
|
break;
|
|
}
|
|
case 0xf6:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
_mm_prefetch(in + 2 * 1, _MM_HINT_NTA);
|
|
*(to + 0) = *(uint16_t *)(in + 2 * 0);
|
|
_mm_prefetch(in + 2 * 2, _MM_HINT_NTA);
|
|
*(to + 1) = *(uint16_t *)(in + 2 * 1);
|
|
in += 2 * 2;
|
|
to += 2;
|
|
break;
|
|
}
|
|
case 0xf7:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
_mm_prefetch(in + 2 * 1, _MM_HINT_NTA);
|
|
*(to + 0) = *(uint16_t *)(in + 2 * 0);
|
|
in += 2 * 1;
|
|
to += 1;
|
|
break;
|
|
}
|
|
case 0xf8:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
_mm_prefetch(in + 3 * 1, _MM_HINT_NTA);
|
|
*(to + 0) = (*(uint8_t *)(in + 3 * 0) << 16) | (*(uint8_t *)(in + 3 * 0 + 1) << 8) | (*(uint8_t *)(in + 3 * 0 + 2));
|
|
_mm_prefetch(in + 3 * 2, _MM_HINT_NTA);
|
|
*(to + 1) = (*(uint8_t *)(in + 3 * 1) << 16) | (*(uint8_t *)(in + 3 * 1 + 1) << 8) | (*(uint8_t *)(in + 3 * 1 + 2));
|
|
_mm_prefetch(in + 3 * 3, _MM_HINT_NTA);
|
|
*(to + 2) = (*(uint8_t *)(in + 3 * 2) << 16) | (*(uint8_t *)(in + 3 * 2 + 1) << 8) | (*(uint8_t *)(in + 3 * 2 + 2));
|
|
_mm_prefetch(in + 3 * 4, _MM_HINT_NTA);
|
|
*(to + 3) = (*(uint8_t *)(in + 3 * 3) << 16) | (*(uint8_t *)(in + 3 * 3 + 1) << 8) | (*(uint8_t *)(in + 3 * 3 + 2));
|
|
in += 3 * 4;
|
|
to += 4;
|
|
break;
|
|
}
|
|
case 0xf9:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
_mm_prefetch(in + 3 * 1, _MM_HINT_NTA);
|
|
*(to + 0) = (*(uint8_t *)(in + 3 * 0) << 16) | (*(uint8_t *)(in + 3 * 0 + 1) << 8) | (*(uint8_t *)(in + 3 * 0 + 2));
|
|
_mm_prefetch(in + 3 * 2, _MM_HINT_NTA);
|
|
*(to + 1) = (*(uint8_t *)(in + 3 * 1) << 16) | (*(uint8_t *)(in + 3 * 1 + 1) << 8) | (*(uint8_t *)(in + 3 * 1 + 2));
|
|
_mm_prefetch(in + 3 * 3, _MM_HINT_NTA);
|
|
*(to + 2) = (*(uint8_t *)(in + 3 * 2) << 16) | (*(uint8_t *)(in + 3 * 2 + 1) << 8) | (*(uint8_t *)(in + 3 * 2 + 2));
|
|
in += 3 * 3;
|
|
to += 3;
|
|
break;
|
|
}
|
|
case 0xfa:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
_mm_prefetch(in + 3 * 1, _MM_HINT_NTA);
|
|
*(to + 0) = (*(uint8_t *)(in + 3 * 0) << 16) | (*(uint8_t *)(in + 3 * 0 + 1) << 8) | (*(uint8_t *)(in + 3 * 0 + 2));
|
|
_mm_prefetch(in + 3 * 2, _MM_HINT_NTA);
|
|
*(to + 1) = (*(uint8_t *)(in + 3 * 1) << 16) | (*(uint8_t *)(in + 3 * 1 + 1) << 8) | (*(uint8_t *)(in + 3 * 1 + 2));
|
|
in += 3 * 2;
|
|
to += 2;
|
|
break;
|
|
}
|
|
case 0xfb:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
_mm_prefetch(in + 3 * 1, _MM_HINT_NTA);
|
|
*(to + 0) = (*(uint8_t *)(in + 3 * 0) << 16) | (*(uint8_t *)(in + 3 * 0 + 1) << 8) | (*(uint8_t *)(in + 3 * 0 + 2));
|
|
in += 3 * 1;
|
|
to += 1;
|
|
break;
|
|
}
|
|
case 0xfc:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
_mm_prefetch(in + 4 * 1, _MM_HINT_NTA);
|
|
*(to + 0) = *(uint32_t *)(in + 4 * 0);
|
|
_mm_prefetch(in + 4 * 2, _MM_HINT_NTA);
|
|
*(to + 1) = *(uint32_t *)(in + 4 * 1);
|
|
_mm_prefetch(in + 4 * 3, _MM_HINT_NTA);
|
|
*(to + 2) = *(uint32_t *)(in + 4 * 2);
|
|
_mm_prefetch(in + 4 * 4, _MM_HINT_NTA);
|
|
*(to + 3) = *(uint32_t *)(in + 4 * 3);
|
|
in += 4 * 4;
|
|
to += 4;
|
|
break;
|
|
}
|
|
case 0xfd:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
_mm_prefetch(in + 4 * 1, _MM_HINT_NTA);
|
|
*(to + 0) = *(uint32_t *)(in + 4 * 0);
|
|
_mm_prefetch(in + 4 * 2, _MM_HINT_NTA);
|
|
*(to + 1) = *(uint32_t *)(in + 4 * 1);
|
|
_mm_prefetch(in + 4 * 3, _MM_HINT_NTA);
|
|
*(to + 2) = *(uint32_t *)(in + 4 * 2);
|
|
in += 4 * 3;
|
|
to += 3;
|
|
break;
|
|
}
|
|
case 0xfe:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
_mm_prefetch(in + 4 * 1, _MM_HINT_NTA);
|
|
*(to + 0) = *(uint32_t *)(in + 4 * 0);
|
|
_mm_prefetch(in + 4 * 2, _MM_HINT_NTA);
|
|
*(to + 1) = *(uint32_t *)(in + 4 * 1);
|
|
in += 4 * 2;
|
|
to += 2;
|
|
break;
|
|
}
|
|
case 0xff:
|
|
_mm_prefetch(keys, _MM_HINT_NTA);
|
|
{
|
|
_mm_prefetch(in + 4 * 1, _MM_HINT_NTA);
|
|
*(to + 0) = *(uint32_t *)(in + 4 * 0);
|
|
in += 4 * 1;
|
|
to += 1;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|