/**
|
|
Copyright (C) powturbo 2013-2023
|
|
SPDX-License-Identifier: GPL v2 License
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License along
|
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
|
|
- homepage : https://sites.google.com/site/powturbo/
|
|
- github : https://github.com/powturbo
|
|
- twitter : https://twitter.com/powturbo
|
|
- email : powturbo [_AT_] gmail [_DOT_] com
|
|
**/
|
|
// v8.c - "Integer Compression" TurboByte 16/32 bits (SIMD Group Varint, Streamvbyte family)
|
|
#ifndef V8ENC
|
|
#pragma warning( disable : 4005)
|
|
#pragma warning( disable : 4090)
|
|
#pragma warning( disable : 4068)
|
|
|
|
#include "include_/conf.h"
|
|
#include "include_/vint.h"
|
|
#include "include_/bitutil_.h"
|
|
|
|
/* Size in bytes of the tag (descriptor) area for _n_ values of width _usize_:
   2 tag bits per 32-bit value, 1 tag bit per 16-bit value, rounded up. */
#define V8PAYLOAD(_n_, _usize_) (((_n_)*(_usize_/16)+7)/8)
/* Worst-case compressed size: tag area plus every value stored at full width. */
#define V8BOUND(_n_, _usize_) (V8PAYLOAD(_n_, _usize_)+ (_n_)*(_usize_/8))

/* Return an upper bound on the compressed output size for n 16-bit values. */
size_t v8bound16(size_t n) {
  size_t bound = V8BOUND(n, 16);
  return bound;
}

/* Return an upper bound on the compressed output size for n 32-bit values. */
size_t v8bound32(size_t n) {
  size_t bound = V8BOUND(n, 32);
  return bound;
}
|
|
|
|
/* Return the exact compressed size in bytes for n 16-bit values:
   per-value data bytes (1 byte if the value fits in 8 bits, else 2)
   plus the tag payload of one bit per value, rounded up to whole bytes
   (== V8PAYLOAD(n, 16)).
   Note: a zero value still occupies 1 data byte. */
size_t v8len16(const uint16_t *in, size_t n) {
  size_t c = 0;
  const uint16_t *ip;                 /* const-correct: 'in' is const-qualified */
  for(ip = in; ip < in + n; ip++)
    c += 1 + (*ip > 0xff);            /* == (bsr16(x)+7)/8 for x != 0, and 1 for x == 0 */
  return c + (n + 7)/8;               /* tag payload: V8PAYLOAD(n, 16) */
}
|
|
|
|
/* Return the exact compressed size in bytes for n 32-bit values:
   per-value data bytes (1..4, the minimum bytes needed to hold the value,
   with a zero value occupying 1 byte) plus the tag payload of two bits
   per value, rounded up to whole bytes (== V8PAYLOAD(n, 32)). */
size_t v8len32(const uint32_t *in, size_t n) {
  size_t c = 0;
  const uint32_t *ip;                 /* const-correct: 'in' is const-qualified */
  for(ip = in; ip < in + n; ip++) {
    uint32_t x = *ip;                 /* == (bsr32(x)+7)/8 for x != 0, and 1 for x == 0 */
    c += x < (1u<<8)  ? 1 :
         x < (1u<<16) ? 2 :
         x < (1u<<24) ? 3 : 4;
  }
  return c + (2*n + 7)/8;             /* tag payload: V8PAYLOAD(n, 32) */
}
|
|
|
|
/* LEN32(m, i): total encoded byte length of the i-th group of four 32-bit
   values, where byte i of the tag word m holds four 2-bit length codes
   (code k -> k+1 data bytes per value). len32[b] is therefore
   4 + sum of the four 2-bit codes in b; e.g. len32[0] = 4 (all 1-byte),
   len32[0xff] = 16 (all 4-byte). */
#define LEN32(_m_,_i_) len32[(uint8_t)(_m_>>(_i_*8))]
static const unsigned char len32[256] = {
   4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9,10,
   5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9,10, 8, 9,10,11,
   6, 7, 8, 9, 7, 8, 9,10, 8, 9,10,11, 9,10,11,12,
   7, 8, 9,10, 8, 9,10,11, 9,10,11,12,10,11,12,13,
   5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9,10, 8, 9,10,11,
   6, 7, 8, 9, 7, 8, 9,10, 8, 9,10,11, 9,10,11,12,
   7, 8, 9,10, 8, 9,10,11, 9,10,11,12,10,11,12,13,
   8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13,14,
   6, 7, 8, 9, 7, 8, 9,10, 8, 9,10,11, 9,10,11,12,
   7, 8, 9,10, 8, 9,10,11, 9,10,11,12,10,11,12,13,
   8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13,14,
   9,10,11,12,10,11,12,13,11,12,13,14,12,13,14,15,
   7, 8, 9,10, 8, 9,10,11, 9,10,11,12,10,11,12,13,
   8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13,14,
   9,10,11,12,10,11,12,13,11,12,13,14,12,13,14,15,
  10,11,12,13,11,12,13,14,12,13,14,15,13,14,15,16
};
|
|
|
|
#define _ 0  /* filler selector for unused shuffle slots (table padding) */
/* SVE32(m): load one of 64 aligned 16-byte shuffle patterns used by the
   32-bit encoder; (m) & 0x3f0 is a byte offset into sve32, i.e. entry
   index (m>>4) & 0x3f scaled by 16. Each entry lists the source-byte
   positions of the significant bytes of a register of values, packed to
   the front — presumably consumed via _mm_shuffle_epi8; confirm at the
   encoder call sites. */
#define SVE32(_m_) _mm_loadu_si128((__m128i*)&sve32[(_m_) & 0x3f0])
static const ALIGNED(uint8_t, sve32[64*16],16) = {
  _, 4, 8,12,13,14,15, _, _, _, _, _, _, _, _, _,
  _, 1, 4, 8,12,13,14,15, _, _, _, _, _, _, _, _,
  _, 1, 2, 4, 8,12,13,14,15, _, _, _, _, _, _, _,
  _, 1, 2, 3, 4, 8,12,13,14,15, _, _, _, _, _, _,
  _, 4, 5, 8,12,13,14,15, _, _, _, _, _, _, _, _,
  _, 1, 4, 5, 8,12,13,14,15, _, _, _, _, _, _, _,
  _, 1, 2, 4, 5, 8,12,13,14,15, _, _, _, _, _, _,
  _, 1, 2, 3, 4, 5, 8,12,13,14,15, _, _, _, _, _,
  _, 4, 5, 6, 8,12,13,14,15, _, _, _, _, _, _, _,
  _, 1, 4, 5, 6, 8,12,13,14,15, _, _, _, _, _, _,
  _, 1, 2, 4, 5, 6, 8,12,13,14,15, _, _, _, _, _,
  _, 1, 2, 3, 4, 5, 6, 8,12,13,14,15, _, _, _, _,
  _, 4, 5, 6, 7, 8,12,13,14,15, _, _, _, _, _, _,
  _, 1, 4, 5, 6, 7, 8,12,13,14,15, _, _, _, _, _,
  _, 1, 2, 4, 5, 6, 7, 8,12,13,14,15, _, _, _, _,
  _, 1, 2, 3, 4, 5, 6, 7, 8,12,13,14,15, _, _, _,
  _, 4, 8, 9,12,13,14,15, _, _, _, _, _, _, _, _,
  _, 1, 4, 8, 9,12,13,14,15, _, _, _, _, _, _, _,
  _, 1, 2, 4, 8, 9,12,13,14,15, _, _, _, _, _, _,
  _, 1, 2, 3, 4, 8, 9,12,13,14,15, _, _, _, _, _,
  _, 4, 5, 8, 9,12,13,14,15, _, _, _, _, _, _, _,
  _, 1, 4, 5, 8, 9,12,13,14,15, _, _, _, _, _, _,
  _, 1, 2, 4, 5, 8, 9,12,13,14,15, _, _, _, _, _,
  _, 1, 2, 3, 4, 5, 8, 9,12,13,14,15, _, _, _, _,
  _, 4, 5, 6, 8, 9,12,13,14,15, _, _, _, _, _, _,
  _, 1, 4, 5, 6, 8, 9,12,13,14,15, _, _, _, _, _,
  _, 1, 2, 4, 5, 6, 8, 9,12,13,14,15, _, _, _, _,
  _, 1, 2, 3, 4, 5, 6, 8, 9,12,13,14,15, _, _, _,
  _, 4, 5, 6, 7, 8, 9,12,13,14,15, _, _, _, _, _,
  _, 1, 4, 5, 6, 7, 8, 9,12,13,14,15, _, _, _, _,
  _, 1, 2, 4, 5, 6, 7, 8, 9,12,13,14,15, _, _, _,
  _, 1, 2, 3, 4, 5, 6, 7, 8, 9,12,13,14,15, _, _,
  _, 4, 8, 9,10,12,13,14,15, _, _, _, _, _, _, _,
  _, 1, 4, 8, 9,10,12,13,14,15, _, _, _, _, _, _,
  _, 1, 2, 4, 8, 9,10,12,13,14,15, _, _, _, _, _,
  _, 1, 2, 3, 4, 8, 9,10,12,13,14,15, _, _, _, _,
  _, 4, 5, 8, 9,10,12,13,14,15, _, _, _, _, _, _,
  _, 1, 4, 5, 8, 9,10,12,13,14,15, _, _, _, _, _,
  _, 1, 2, 4, 5, 8, 9,10,12,13,14,15, _, _, _, _,
  _, 1, 2, 3, 4, 5, 8, 9,10,12,13,14,15, _, _, _,
  _, 4, 5, 6, 8, 9,10,12,13,14,15, _, _, _, _, _,
  _, 1, 4, 5, 6, 8, 9,10,12,13,14,15, _, _, _, _,
  _, 1, 2, 4, 5, 6, 8, 9,10,12,13,14,15, _, _, _,
  _, 1, 2, 3, 4, 5, 6, 8, 9,10,12,13,14,15, _, _,
  _, 4, 5, 6, 7, 8, 9,10,12,13,14,15, _, _, _, _,
  _, 1, 4, 5, 6, 7, 8, 9,10,12,13,14,15, _, _, _,
  _, 1, 2, 4, 5, 6, 7, 8, 9,10,12,13,14,15, _, _,
  _, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,12,13,14,15, _,
  _, 4, 8, 9,10,11,12,13,14,15, _, _, _, _, _, _,
  _, 1, 4, 8, 9,10,11,12,13,14,15, _, _, _, _, _,
  _, 1, 2, 4, 8, 9,10,11,12,13,14,15, _, _, _, _,
  _, 1, 2, 3, 4, 8, 9,10,11,12,13,14,15, _, _, _,
  _, 4, 5, 8, 9,10,11,12,13,14,15, _, _, _, _, _,
  _, 1, 4, 5, 8, 9,10,11,12,13,14,15, _, _, _, _,
  _, 1, 2, 4, 5, 8, 9,10,11,12,13,14,15, _, _, _,
  _, 1, 2, 3, 4, 5, 8, 9,10,11,12,13,14,15, _, _,
  _, 4, 5, 6, 8, 9,10,11,12,13,14,15, _, _, _, _,
  _, 1, 4, 5, 6, 8, 9,10,11,12,13,14,15, _, _, _,
  _, 1, 2, 4, 5, 6, 8, 9,10,11,12,13,14,15, _, _,
  _, 1, 2, 3, 4, 5, 6, 8, 9,10,11,12,13,14,15, _,
  _, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, _, _, _,
  _, 1, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, _, _,
  _, 1, 2, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, _,
  _, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15
};
|
|
|
|
/* SVE16(m): load one of 128 aligned 16-byte shuffle patterns used by the
   16-bit encoder; (m) & 0x7f0 is a byte offset into sve16, i.e. entry
   index (m>>4) & 0x7f scaled by 16. Each entry lists the source-byte
   positions of the significant bytes (1 or 2 per 16-bit value), packed to
   the front — presumably consumed via _mm_shuffle_epi8; confirm at the
   encoder call sites. '_' (0) fills the unused tail slots. */
#define SVE16(_m_) _mm_loadu_si128((__m128i*)&sve16[(_m_) & 0x7f0])
static const ALIGNED(uint8_t, sve16[128*16],16) = {
  _, 2, 4, 6, 8,10,12,14,15, _, _, _, _, _, _, _,
  _, 1, 2, 4, 6, 8,10,12,14,15, _, _, _, _, _, _,
  _, 2, 3, 4, 6, 8,10,12,14,15, _, _, _, _, _, _,
  _, 1, 2, 3, 4, 6, 8,10,12,14,15, _, _, _, _, _,
  _, 2, 4, 5, 6, 8,10,12,14,15, _, _, _, _, _, _,
  _, 1, 2, 4, 5, 6, 8,10,12,14,15, _, _, _, _, _,
  _, 2, 3, 4, 5, 6, 8,10,12,14,15, _, _, _, _, _,
  _, 1, 2, 3, 4, 5, 6, 8,10,12,14,15, _, _, _, _,
  _, 2, 4, 6, 7, 8,10,12,14,15, _, _, _, _, _, _,
  _, 1, 2, 4, 6, 7, 8,10,12,14,15, _, _, _, _, _,
  _, 2, 3, 4, 6, 7, 8,10,12,14,15, _, _, _, _, _,
  _, 1, 2, 3, 4, 6, 7, 8,10,12,14,15, _, _, _, _,
  _, 2, 4, 5, 6, 7, 8,10,12,14,15, _, _, _, _, _,
  _, 1, 2, 4, 5, 6, 7, 8,10,12,14,15, _, _, _, _,
  _, 2, 3, 4, 5, 6, 7, 8,10,12,14,15, _, _, _, _,
  _, 1, 2, 3, 4, 5, 6, 7, 8,10,12,14,15, _, _, _,
  _, 2, 4, 6, 8, 9,10,12,14,15, _, _, _, _, _, _,
  _, 1, 2, 4, 6, 8, 9,10,12,14,15, _, _, _, _, _,
  _, 2, 3, 4, 6, 8, 9,10,12,14,15, _, _, _, _, _,
  _, 1, 2, 3, 4, 6, 8, 9,10,12,14,15, _, _, _, _,
  _, 2, 4, 5, 6, 8, 9,10,12,14,15, _, _, _, _, _,
  _, 1, 2, 4, 5, 6, 8, 9,10,12,14,15, _, _, _, _,
  _, 2, 3, 4, 5, 6, 8, 9,10,12,14,15, _, _, _, _,
  _, 1, 2, 3, 4, 5, 6, 8, 9,10,12,14,15, _, _, _,
  _, 2, 4, 6, 7, 8, 9,10,12,14,15, _, _, _, _, _,
  _, 1, 2, 4, 6, 7, 8, 9,10,12,14,15, _, _, _, _,
  _, 2, 3, 4, 6, 7, 8, 9,10,12,14,15, _, _, _, _,
  _, 1, 2, 3, 4, 6, 7, 8, 9,10,12,14,15, _, _, _,
  _, 2, 4, 5, 6, 7, 8, 9,10,12,14,15, _, _, _, _,
  _, 1, 2, 4, 5, 6, 7, 8, 9,10,12,14,15, _, _, _,
  _, 2, 3, 4, 5, 6, 7, 8, 9,10,12,14,15, _, _, _,
  _, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,12,14,15, _, _,
  _, 2, 4, 6, 8,10,11,12,14,15, _, _, _, _, _, _,
  _, 1, 2, 4, 6, 8,10,11,12,14,15, _, _, _, _, _,
  _, 2, 3, 4, 6, 8,10,11,12,14,15, _, _, _, _, _,
  _, 1, 2, 3, 4, 6, 8,10,11,12,14,15, _, _, _, _,
  _, 2, 4, 5, 6, 8,10,11,12,14,15, _, _, _, _, _,
  _, 1, 2, 4, 5, 6, 8,10,11,12,14,15, _, _, _, _,
  _, 2, 3, 4, 5, 6, 8,10,11,12,14,15, _, _, _, _,
  _, 1, 2, 3, 4, 5, 6, 8,10,11,12,14,15, _, _, _,
  _, 2, 4, 6, 7, 8,10,11,12,14,15, _, _, _, _, _,
  _, 1, 2, 4, 6, 7, 8,10,11,12,14,15, _, _, _, _,
  _, 2, 3, 4, 6, 7, 8,10,11,12,14,15, _, _, _, _,
  _, 1, 2, 3, 4, 6, 7, 8,10,11,12,14,15, _, _, _,
  _, 2, 4, 5, 6, 7, 8,10,11,12,14,15, _, _, _, _,
  _, 1, 2, 4, 5, 6, 7, 8,10,11,12,14,15, _, _, _,
  _, 2, 3, 4, 5, 6, 7, 8,10,11,12,14,15, _, _, _,
  _, 1, 2, 3, 4, 5, 6, 7, 8,10,11,12,14,15, _, _,
  _, 2, 4, 6, 8, 9,10,11,12,14,15, _, _, _, _, _,
  _, 1, 2, 4, 6, 8, 9,10,11,12,14,15, _, _, _, _,
  _, 2, 3, 4, 6, 8, 9,10,11,12,14,15, _, _, _, _,
  _, 1, 2, 3, 4, 6, 8, 9,10,11,12,14,15, _, _, _,
  _, 2, 4, 5, 6, 8, 9,10,11,12,14,15, _, _, _, _,
  _, 1, 2, 4, 5, 6, 8, 9,10,11,12,14,15, _, _, _,
  _, 2, 3, 4, 5, 6, 8, 9,10,11,12,14,15, _, _, _,
  _, 1, 2, 3, 4, 5, 6, 8, 9,10,11,12,14,15, _, _,
  _, 2, 4, 6, 7, 8, 9,10,11,12,14,15, _, _, _, _,
  _, 1, 2, 4, 6, 7, 8, 9,10,11,12,14,15, _, _, _,
  _, 2, 3, 4, 6, 7, 8, 9,10,11,12,14,15, _, _, _,
  _, 1, 2, 3, 4, 6, 7, 8, 9,10,11,12,14,15, _, _,
  _, 2, 4, 5, 6, 7, 8, 9,10,11,12,14,15, _, _, _,
  _, 1, 2, 4, 5, 6, 7, 8, 9,10,11,12,14,15, _, _,
  _, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,14,15, _, _,
  _, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,14,15, _,
  _, 2, 4, 6, 8,10,12,13,14,15, _, _, _, _, _, _,
  _, 1, 2, 4, 6, 8,10,12,13,14,15, _, _, _, _, _,
  _, 2, 3, 4, 6, 8,10,12,13,14,15, _, _, _, _, _,
  _, 1, 2, 3, 4, 6, 8,10,12,13,14,15, _, _, _, _,
  _, 2, 4, 5, 6, 8,10,12,13,14,15, _, _, _, _, _,
  _, 1, 2, 4, 5, 6, 8,10,12,13,14,15, _, _, _, _,
  _, 2, 3, 4, 5, 6, 8,10,12,13,14,15, _, _, _, _,
  _, 1, 2, 3, 4, 5, 6, 8,10,12,13,14,15, _, _, _,
  _, 2, 4, 6, 7, 8,10,12,13,14,15, _, _, _, _, _,
  _, 1, 2, 4, 6, 7, 8,10,12,13,14,15, _, _, _, _,
  _, 2, 3, 4, 6, 7, 8,10,12,13,14,15, _, _, _, _,
  _, 1, 2, 3, 4, 6, 7, 8,10,12,13,14,15, _, _, _,
  _, 2, 4, 5, 6, 7, 8,10,12,13,14,15, _, _, _, _,
  _, 1, 2, 4, 5, 6, 7, 8,10,12,13,14,15, _, _, _,
  _, 2, 3, 4, 5, 6, 7, 8,10,12,13,14,15, _, _, _,
  _, 1, 2, 3, 4, 5, 6, 7, 8,10,12,13,14,15, _, _,
  _, 2, 4, 6, 8, 9,10,12,13,14,15, _, _, _, _, _,
  _, 1, 2, 4, 6, 8, 9,10,12,13,14,15, _, _, _, _,
  _, 2, 3, 4, 6, 8, 9,10,12,13,14,15, _, _, _, _,
  _, 1, 2, 3, 4, 6, 8, 9,10,12,13,14,15, _, _, _,
  _, 2, 4, 5, 6, 8, 9,10,12,13,14,15, _, _, _, _,
  _, 1, 2, 4, 5, 6, 8, 9,10,12,13,14,15, _, _, _,
  _, 2, 3, 4, 5, 6, 8, 9,10,12,13,14,15, _, _, _,
  _, 1, 2, 3, 4, 5, 6, 8, 9,10,12,13,14,15, _, _,
  _, 2, 4, 6, 7, 8, 9,10,12,13,14,15, _, _, _, _,
  _, 1, 2, 4, 6, 7, 8, 9,10,12,13,14,15, _, _, _,
  _, 2, 3, 4, 6, 7, 8, 9,10,12,13,14,15, _, _, _,
  _, 1, 2, 3, 4, 6, 7, 8, 9,10,12,13,14,15, _, _,
  _, 2, 4, 5, 6, 7, 8, 9,10,12,13,14,15, _, _, _,
  _, 1, 2, 4, 5, 6, 7, 8, 9,10,12,13,14,15, _, _,
  _, 2, 3, 4, 5, 6, 7, 8, 9,10,12,13,14,15, _, _,
  _, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,12,13,14,15, _,
  _, 2, 4, 6, 8,10,11,12,13,14,15, _, _, _, _, _,
  _, 1, 2, 4, 6, 8,10,11,12,13,14,15, _, _, _, _,
  _, 2, 3, 4, 6, 8,10,11,12,13,14,15, _, _, _, _,
  _, 1, 2, 3, 4, 6, 8,10,11,12,13,14,15, _, _, _,
  _, 2, 4, 5, 6, 8,10,11,12,13,14,15, _, _, _, _,
  _, 1, 2, 4, 5, 6, 8,10,11,12,13,14,15, _, _, _,
  _, 2, 3, 4, 5, 6, 8,10,11,12,13,14,15, _, _, _,
  _, 1, 2, 3, 4, 5, 6, 8,10,11,12,13,14,15, _, _,
  _, 2, 4, 6, 7, 8,10,11,12,13,14,15, _, _, _, _,
  _, 1, 2, 4, 6, 7, 8,10,11,12,13,14,15, _, _, _,
  _, 2, 3, 4, 6, 7, 8,10,11,12,13,14,15, _, _, _,
  _, 1, 2, 3, 4, 6, 7, 8,10,11,12,13,14,15, _, _,
  _, 2, 4, 5, 6, 7, 8,10,11,12,13,14,15, _, _, _,
  _, 1, 2, 4, 5, 6, 7, 8,10,11,12,13,14,15, _, _,
  _, 2, 3, 4, 5, 6, 7, 8,10,11,12,13,14,15, _, _,
  _, 1, 2, 3, 4, 5, 6, 7, 8,10,11,12,13,14,15, _,
  _, 2, 4, 6, 8, 9,10,11,12,13,14,15, _, _, _, _,
  _, 1, 2, 4, 6, 8, 9,10,11,12,13,14,15, _, _, _,
  _, 2, 3, 4, 6, 8, 9,10,11,12,13,14,15, _, _, _,
  _, 1, 2, 3, 4, 6, 8, 9,10,11,12,13,14,15, _, _,
  _, 2, 4, 5, 6, 8, 9,10,11,12,13,14,15, _, _, _,
  _, 1, 2, 4, 5, 6, 8, 9,10,11,12,13,14,15, _, _,
  _, 2, 3, 4, 5, 6, 8, 9,10,11,12,13,14,15, _, _,
  _, 1, 2, 3, 4, 5, 6, 8, 9,10,11,12,13,14,15, _,
  _, 2, 4, 6, 7, 8, 9,10,11,12,13,14,15, _, _, _,
  _, 1, 2, 4, 6, 7, 8, 9,10,11,12,13,14,15, _, _,
  _, 2, 3, 4, 6, 7, 8, 9,10,11,12,13,14,15, _, _,
  _, 1, 2, 3, 4, 6, 7, 8, 9,10,11,12,13,14,15, _,
  _, 2, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, _, _,
  _, 1, 2, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, _,
  _, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, _,
  _, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15
};
#undef _
|
|
|
|
#define _ 0xff  /* selector with high bit set: pshufb writes 0 for this lane */
/* SVD32(m, i): load the 16-byte decode shuffle for the i-th group of four
   32-bit values; byte i of the tag word m (four 2-bit length codes) indexes
   one of 256 entries. Each entry maps packed input bytes back to their
   little-endian dword positions, with '_' (0xff) zero-filling the high
   bytes — presumably applied via _mm_shuffle_epi8; confirm at the decoder
   call sites.
   NOTE(review): unlike SVE32/SVE16 there is no (__m128i*) cast on the
   argument here; svd32[...] is unsigned char[16], so this relies on an
   implicit pointer conversion — most compilers warn. Confirm intended. */
#define SVD32(_m_,_i_) _mm_loadu_si128(svd32[(uint8_t)(_m_>>(_i_<<3))])
static const ALIGNED(unsigned char, svd32[256][16],16) = {
  { 0, _, _, _, 1, _, _, _, 2, _, _, _, 3, _, _, _ },
  { 0, 1, _, _, 2, _, _, _, 3, _, _, _, 4, _, _, _ },
  { 0, 1, 2, _, 3, _, _, _, 4, _, _, _, 5, _, _, _ },
  { 0, 1, 2, 3, 4, _, _, _, 5, _, _, _, 6, _, _, _ },
  { 0, _, _, _, 1, 2, _, _, 3, _, _, _, 4, _, _, _ },
  { 0, 1, _, _, 2, 3, _, _, 4, _, _, _, 5, _, _, _ },
  { 0, 1, 2, _, 3, 4, _, _, 5, _, _, _, 6, _, _, _ },
  { 0, 1, 2, 3, 4, 5, _, _, 6, _, _, _, 7, _, _, _ },
  { 0, _, _, _, 1, 2, 3, _, 4, _, _, _, 5, _, _, _ },
  { 0, 1, _, _, 2, 3, 4, _, 5, _, _, _, 6, _, _, _ },
  { 0, 1, 2, _, 3, 4, 5, _, 6, _, _, _, 7, _, _, _ },
  { 0, 1, 2, 3, 4, 5, 6, _, 7, _, _, _, 8, _, _, _ },
  { 0, _, _, _, 1, 2, 3, 4, 5, _, _, _, 6, _, _, _ },
  { 0, 1, _, _, 2, 3, 4, 5, 6, _, _, _, 7, _, _, _ },
  { 0, 1, 2, _, 3, 4, 5, 6, 7, _, _, _, 8, _, _, _ },
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, _, _, _, 9, _, _, _ },
  { 0, _, _, _, 1, _, _, _, 2, 3, _, _, 4, _, _, _ },
  { 0, 1, _, _, 2, _, _, _, 3, 4, _, _, 5, _, _, _ },
  { 0, 1, 2, _, 3, _, _, _, 4, 5, _, _, 6, _, _, _ },
  { 0, 1, 2, 3, 4, _, _, _, 5, 6, _, _, 7, _, _, _ },
  { 0, _, _, _, 1, 2, _, _, 3, 4, _, _, 5, _, _, _ },
  { 0, 1, _, _, 2, 3, _, _, 4, 5, _, _, 6, _, _, _ },
  { 0, 1, 2, _, 3, 4, _, _, 5, 6, _, _, 7, _, _, _ },
  { 0, 1, 2, 3, 4, 5, _, _, 6, 7, _, _, 8, _, _, _ },
  { 0, _, _, _, 1, 2, 3, _, 4, 5, _, _, 6, _, _, _ },
  { 0, 1, _, _, 2, 3, 4, _, 5, 6, _, _, 7, _, _, _ },
  { 0, 1, 2, _, 3, 4, 5, _, 6, 7, _, _, 8, _, _, _ },
  { 0, 1, 2, 3, 4, 5, 6, _, 7, 8, _, _, 9, _, _, _ },
  { 0, _, _, _, 1, 2, 3, 4, 5, 6, _, _, 7, _, _, _ },
  { 0, 1, _, _, 2, 3, 4, 5, 6, 7, _, _, 8, _, _, _ },
  { 0, 1, 2, _, 3, 4, 5, 6, 7, 8, _, _, 9, _, _, _ },
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, _, _,10, _, _, _ },
  { 0, _, _, _, 1, _, _, _, 2, 3, 4, _, 5, _, _, _ },
  { 0, 1, _, _, 2, _, _, _, 3, 4, 5, _, 6, _, _, _ },
  { 0, 1, 2, _, 3, _, _, _, 4, 5, 6, _, 7, _, _, _ },
  { 0, 1, 2, 3, 4, _, _, _, 5, 6, 7, _, 8, _, _, _ },
  { 0, _, _, _, 1, 2, _, _, 3, 4, 5, _, 6, _, _, _ },
  { 0, 1, _, _, 2, 3, _, _, 4, 5, 6, _, 7, _, _, _ },
  { 0, 1, 2, _, 3, 4, _, _, 5, 6, 7, _, 8, _, _, _ },
  { 0, 1, 2, 3, 4, 5, _, _, 6, 7, 8, _, 9, _, _, _ },
  { 0, _, _, _, 1, 2, 3, _, 4, 5, 6, _, 7, _, _, _ },
  { 0, 1, _, _, 2, 3, 4, _, 5, 6, 7, _, 8, _, _, _ },
  { 0, 1, 2, _, 3, 4, 5, _, 6, 7, 8, _, 9, _, _, _ },
  { 0, 1, 2, 3, 4, 5, 6, _, 7, 8, 9, _,10, _, _, _ },
  { 0, _, _, _, 1, 2, 3, 4, 5, 6, 7, _, 8, _, _, _ },
  { 0, 1, _, _, 2, 3, 4, 5, 6, 7, 8, _, 9, _, _, _ },
  { 0, 1, 2, _, 3, 4, 5, 6, 7, 8, 9, _,10, _, _, _ },
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, _,11, _, _, _ },
  { 0, _, _, _, 1, _, _, _, 2, 3, 4, 5, 6, _, _, _ },
  { 0, 1, _, _, 2, _, _, _, 3, 4, 5, 6, 7, _, _, _ },
  { 0, 1, 2, _, 3, _, _, _, 4, 5, 6, 7, 8, _, _, _ },
  { 0, 1, 2, 3, 4, _, _, _, 5, 6, 7, 8, 9, _, _, _ },
  { 0, _, _, _, 1, 2, _, _, 3, 4, 5, 6, 7, _, _, _ },
  { 0, 1, _, _, 2, 3, _, _, 4, 5, 6, 7, 8, _, _, _ },
  { 0, 1, 2, _, 3, 4, _, _, 5, 6, 7, 8, 9, _, _, _ },
  { 0, 1, 2, 3, 4, 5, _, _, 6, 7, 8, 9,10, _, _, _ },
  { 0, _, _, _, 1, 2, 3, _, 4, 5, 6, 7, 8, _, _, _ },
  { 0, 1, _, _, 2, 3, 4, _, 5, 6, 7, 8, 9, _, _, _ },
  { 0, 1, 2, _, 3, 4, 5, _, 6, 7, 8, 9,10, _, _, _ },
  { 0, 1, 2, 3, 4, 5, 6, _, 7, 8, 9,10,11, _, _, _ },
  { 0, _, _, _, 1, 2, 3, 4, 5, 6, 7, 8, 9, _, _, _ },
  { 0, 1, _, _, 2, 3, 4, 5, 6, 7, 8, 9,10, _, _, _ },
  { 0, 1, 2, _, 3, 4, 5, 6, 7, 8, 9,10,11, _, _, _ },
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, _, _, _ },
  { 0, _, _, _, 1, _, _, _, 2, _, _, _, 3, 4, _, _ },
  { 0, 1, _, _, 2, _, _, _, 3, _, _, _, 4, 5, _, _ },
  { 0, 1, 2, _, 3, _, _, _, 4, _, _, _, 5, 6, _, _ },
  { 0, 1, 2, 3, 4, _, _, _, 5, _, _, _, 6, 7, _, _ },
  { 0, _, _, _, 1, 2, _, _, 3, _, _, _, 4, 5, _, _ },
  { 0, 1, _, _, 2, 3, _, _, 4, _, _, _, 5, 6, _, _ },
  { 0, 1, 2, _, 3, 4, _, _, 5, _, _, _, 6, 7, _, _ },
  { 0, 1, 2, 3, 4, 5, _, _, 6, _, _, _, 7, 8, _, _ },
  { 0, _, _, _, 1, 2, 3, _, 4, _, _, _, 5, 6, _, _ },
  { 0, 1, _, _, 2, 3, 4, _, 5, _, _, _, 6, 7, _, _ },
  { 0, 1, 2, _, 3, 4, 5, _, 6, _, _, _, 7, 8, _, _ },
  { 0, 1, 2, 3, 4, 5, 6, _, 7, _, _, _, 8, 9, _, _ },
  { 0, _, _, _, 1, 2, 3, 4, 5, _, _, _, 6, 7, _, _ },
  { 0, 1, _, _, 2, 3, 4, 5, 6, _, _, _, 7, 8, _, _ },
  { 0, 1, 2, _, 3, 4, 5, 6, 7, _, _, _, 8, 9, _, _ },
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, _, _, _, 9,10, _, _ },
  { 0, _, _, _, 1, _, _, _, 2, 3, _, _, 4, 5, _, _ },
  { 0, 1, _, _, 2, _, _, _, 3, 4, _, _, 5, 6, _, _ },
  { 0, 1, 2, _, 3, _, _, _, 4, 5, _, _, 6, 7, _, _ },
  { 0, 1, 2, 3, 4, _, _, _, 5, 6, _, _, 7, 8, _, _ },
  { 0, _, _, _, 1, 2, _, _, 3, 4, _, _, 5, 6, _, _ },
  { 0, 1, _, _, 2, 3, _, _, 4, 5, _, _, 6, 7, _, _ },
  { 0, 1, 2, _, 3, 4, _, _, 5, 6, _, _, 7, 8, _, _ },
  { 0, 1, 2, 3, 4, 5, _, _, 6, 7, _, _, 8, 9, _, _ },
  { 0, _, _, _, 1, 2, 3, _, 4, 5, _, _, 6, 7, _, _ },
  { 0, 1, _, _, 2, 3, 4, _, 5, 6, _, _, 7, 8, _, _ },
  { 0, 1, 2, _, 3, 4, 5, _, 6, 7, _, _, 8, 9, _, _ },
  { 0, 1, 2, 3, 4, 5, 6, _, 7, 8, _, _, 9,10, _, _ },
  { 0, _, _, _, 1, 2, 3, 4, 5, 6, _, _, 7, 8, _, _ },
  { 0, 1, _, _, 2, 3, 4, 5, 6, 7, _, _, 8, 9, _, _ },
  { 0, 1, 2, _, 3, 4, 5, 6, 7, 8, _, _, 9,10, _, _ },
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, _, _,10,11, _, _ },
  { 0, _, _, _, 1, _, _, _, 2, 3, 4, _, 5, 6, _, _ },
  { 0, 1, _, _, 2, _, _, _, 3, 4, 5, _, 6, 7, _, _ },
  { 0, 1, 2, _, 3, _, _, _, 4, 5, 6, _, 7, 8, _, _ },
  { 0, 1, 2, 3, 4, _, _, _, 5, 6, 7, _, 8, 9, _, _ },
  { 0, _, _, _, 1, 2, _, _, 3, 4, 5, _, 6, 7, _, _ },
  { 0, 1, _, _, 2, 3, _, _, 4, 5, 6, _, 7, 8, _, _ },
  { 0, 1, 2, _, 3, 4, _, _, 5, 6, 7, _, 8, 9, _, _ },
  { 0, 1, 2, 3, 4, 5, _, _, 6, 7, 8, _, 9,10, _, _ },
  { 0, _, _, _, 1, 2, 3, _, 4, 5, 6, _, 7, 8, _, _ },
  { 0, 1, _, _, 2, 3, 4, _, 5, 6, 7, _, 8, 9, _, _ },
  { 0, 1, 2, _, 3, 4, 5, _, 6, 7, 8, _, 9,10, _, _ },
  { 0, 1, 2, 3, 4, 5, 6, _, 7, 8, 9, _,10,11, _, _ },
  { 0, _, _, _, 1, 2, 3, 4, 5, 6, 7, _, 8, 9, _, _ },
  { 0, 1, _, _, 2, 3, 4, 5, 6, 7, 8, _, 9,10, _, _ },
  { 0, 1, 2, _, 3, 4, 5, 6, 7, 8, 9, _,10,11, _, _ },
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, _,11,12, _, _ },
  { 0, _, _, _, 1, _, _, _, 2, 3, 4, 5, 6, 7, _, _ },
  { 0, 1, _, _, 2, _, _, _, 3, 4, 5, 6, 7, 8, _, _ },
  { 0, 1, 2, _, 3, _, _, _, 4, 5, 6, 7, 8, 9, _, _ },
  { 0, 1, 2, 3, 4, _, _, _, 5, 6, 7, 8, 9,10, _, _ },
  { 0, _, _, _, 1, 2, _, _, 3, 4, 5, 6, 7, 8, _, _ },
  { 0, 1, _, _, 2, 3, _, _, 4, 5, 6, 7, 8, 9, _, _ },
  { 0, 1, 2, _, 3, 4, _, _, 5, 6, 7, 8, 9,10, _, _ },
  { 0, 1, 2, 3, 4, 5, _, _, 6, 7, 8, 9,10,11, _, _ },
  { 0, _, _, _, 1, 2, 3, _, 4, 5, 6, 7, 8, 9, _, _ },
  { 0, 1, _, _, 2, 3, 4, _, 5, 6, 7, 8, 9,10, _, _ },
  { 0, 1, 2, _, 3, 4, 5, _, 6, 7, 8, 9,10,11, _, _ },
  { 0, 1, 2, 3, 4, 5, 6, _, 7, 8, 9,10,11,12, _, _ },
  { 0, _, _, _, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, _, _ },
  { 0, 1, _, _, 2, 3, 4, 5, 6, 7, 8, 9,10,11, _, _ },
  { 0, 1, 2, _, 3, 4, 5, 6, 7, 8, 9,10,11,12, _, _ },
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, _, _ },
  { 0, _, _, _, 1, _, _, _, 2, _, _, _, 3, 4, 5, _ },
  { 0, 1, _, _, 2, _, _, _, 3, _, _, _, 4, 5, 6, _ },
  { 0, 1, 2, _, 3, _, _, _, 4, _, _, _, 5, 6, 7, _ },
  { 0, 1, 2, 3, 4, _, _, _, 5, _, _, _, 6, 7, 8, _ },
  { 0, _, _, _, 1, 2, _, _, 3, _, _, _, 4, 5, 6, _ },
  { 0, 1, _, _, 2, 3, _, _, 4, _, _, _, 5, 6, 7, _ },
  { 0, 1, 2, _, 3, 4, _, _, 5, _, _, _, 6, 7, 8, _ },
  { 0, 1, 2, 3, 4, 5, _, _, 6, _, _, _, 7, 8, 9, _ },
  { 0, _, _, _, 1, 2, 3, _, 4, _, _, _, 5, 6, 7, _ },
  { 0, 1, _, _, 2, 3, 4, _, 5, _, _, _, 6, 7, 8, _ },
  { 0, 1, 2, _, 3, 4, 5, _, 6, _, _, _, 7, 8, 9, _ },
  { 0, 1, 2, 3, 4, 5, 6, _, 7, _, _, _, 8, 9,10, _ },
  { 0, _, _, _, 1, 2, 3, 4, 5, _, _, _, 6, 7, 8, _ },
  { 0, 1, _, _, 2, 3, 4, 5, 6, _, _, _, 7, 8, 9, _ },
  { 0, 1, 2, _, 3, 4, 5, 6, 7, _, _, _, 8, 9,10, _ },
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, _, _, _, 9,10,11, _ },
  { 0, _, _, _, 1, _, _, _, 2, 3, _, _, 4, 5, 6, _ },
  { 0, 1, _, _, 2, _, _, _, 3, 4, _, _, 5, 6, 7, _ },
  { 0, 1, 2, _, 3, _, _, _, 4, 5, _, _, 6, 7, 8, _ },
  { 0, 1, 2, 3, 4, _, _, _, 5, 6, _, _, 7, 8, 9, _ },
  { 0, _, _, _, 1, 2, _, _, 3, 4, _, _, 5, 6, 7, _ },
  { 0, 1, _, _, 2, 3, _, _, 4, 5, _, _, 6, 7, 8, _ },
  { 0, 1, 2, _, 3, 4, _, _, 5, 6, _, _, 7, 8, 9, _ },
  { 0, 1, 2, 3, 4, 5, _, _, 6, 7, _, _, 8, 9,10, _ },
  { 0, _, _, _, 1, 2, 3, _, 4, 5, _, _, 6, 7, 8, _ },
  { 0, 1, _, _, 2, 3, 4, _, 5, 6, _, _, 7, 8, 9, _ },
  { 0, 1, 2, _, 3, 4, 5, _, 6, 7, _, _, 8, 9,10, _ },
  { 0, 1, 2, 3, 4, 5, 6, _, 7, 8, _, _, 9,10,11, _ },
  { 0, _, _, _, 1, 2, 3, 4, 5, 6, _, _, 7, 8, 9, _ },
  { 0, 1, _, _, 2, 3, 4, 5, 6, 7, _, _, 8, 9,10, _ },
  { 0, 1, 2, _, 3, 4, 5, 6, 7, 8, _, _, 9,10,11, _ },
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, _, _,10,11,12, _ },
  { 0, _, _, _, 1, _, _, _, 2, 3, 4, _, 5, 6, 7, _ },
  { 0, 1, _, _, 2, _, _, _, 3, 4, 5, _, 6, 7, 8, _ },
  { 0, 1, 2, _, 3, _, _, _, 4, 5, 6, _, 7, 8, 9, _ },
  { 0, 1, 2, 3, 4, _, _, _, 5, 6, 7, _, 8, 9,10, _ },
  { 0, _, _, _, 1, 2, _, _, 3, 4, 5, _, 6, 7, 8, _ },
  { 0, 1, _, _, 2, 3, _, _, 4, 5, 6, _, 7, 8, 9, _ },
  { 0, 1, 2, _, 3, 4, _, _, 5, 6, 7, _, 8, 9,10, _ },
  { 0, 1, 2, 3, 4, 5, _, _, 6, 7, 8, _, 9,10,11, _ },
  { 0, _, _, _, 1, 2, 3, _, 4, 5, 6, _, 7, 8, 9, _ },
  { 0, 1, _, _, 2, 3, 4, _, 5, 6, 7, _, 8, 9,10, _ },
  { 0, 1, 2, _, 3, 4, 5, _, 6, 7, 8, _, 9,10,11, _ },
  { 0, 1, 2, 3, 4, 5, 6, _, 7, 8, 9, _,10,11,12, _ },
  { 0, _, _, _, 1, 2, 3, 4, 5, 6, 7, _, 8, 9,10, _ },
  { 0, 1, _, _, 2, 3, 4, 5, 6, 7, 8, _, 9,10,11, _ },
  { 0, 1, 2, _, 3, 4, 5, 6, 7, 8, 9, _,10,11,12, _ },
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, _,11,12,13, _ },
  { 0, _, _, _, 1, _, _, _, 2, 3, 4, 5, 6, 7, 8, _ },
  { 0, 1, _, _, 2, _, _, _, 3, 4, 5, 6, 7, 8, 9, _ },
  { 0, 1, 2, _, 3, _, _, _, 4, 5, 6, 7, 8, 9,10, _ },
  { 0, 1, 2, 3, 4, _, _, _, 5, 6, 7, 8, 9,10,11, _ },
  { 0, _, _, _, 1, 2, _, _, 3, 4, 5, 6, 7, 8, 9, _ },
  { 0, 1, _, _, 2, 3, _, _, 4, 5, 6, 7, 8, 9,10, _ },
  { 0, 1, 2, _, 3, 4, _, _, 5, 6, 7, 8, 9,10,11, _ },
  { 0, 1, 2, 3, 4, 5, _, _, 6, 7, 8, 9,10,11,12, _ },
  { 0, _, _, _, 1, 2, 3, _, 4, 5, 6, 7, 8, 9,10, _ },
  { 0, 1, _, _, 2, 3, 4, _, 5, 6, 7, 8, 9,10,11, _ },
  { 0, 1, 2, _, 3, 4, 5, _, 6, 7, 8, 9,10,11,12, _ },
  { 0, 1, 2, 3, 4, 5, 6, _, 7, 8, 9,10,11,12,13, _ },
  { 0, _, _, _, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, _ },
  { 0, 1, _, _, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, _ },
  { 0, 1, 2, _, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, _ },
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, _ },
  { 0, _, _, _, 1, _, _, _, 2, _, _, _, 3, 4, 5, 6 },
  { 0, 1, _, _, 2, _, _, _, 3, _, _, _, 4, 5, 6, 7 },
  { 0, 1, 2, _, 3, _, _, _, 4, _, _, _, 5, 6, 7, 8 },
  { 0, 1, 2, 3, 4, _, _, _, 5, _, _, _, 6, 7, 8, 9 },
  { 0, _, _, _, 1, 2, _, _, 3, _, _, _, 4, 5, 6, 7 },
  { 0, 1, _, _, 2, 3, _, _, 4, _, _, _, 5, 6, 7, 8 },
  { 0, 1, 2, _, 3, 4, _, _, 5, _, _, _, 6, 7, 8, 9 },
  { 0, 1, 2, 3, 4, 5, _, _, 6, _, _, _, 7, 8, 9,10 },
  { 0, _, _, _, 1, 2, 3, _, 4, _, _, _, 5, 6, 7, 8 },
  { 0, 1, _, _, 2, 3, 4, _, 5, _, _, _, 6, 7, 8, 9 },
  { 0, 1, 2, _, 3, 4, 5, _, 6, _, _, _, 7, 8, 9,10 },
  { 0, 1, 2, 3, 4, 5, 6, _, 7, _, _, _, 8, 9,10,11 },
  { 0, _, _, _, 1, 2, 3, 4, 5, _, _, _, 6, 7, 8, 9 },
  { 0, 1, _, _, 2, 3, 4, 5, 6, _, _, _, 7, 8, 9,10 },
  { 0, 1, 2, _, 3, 4, 5, 6, 7, _, _, _, 8, 9,10,11 },
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, _, _, _, 9,10,11,12 },
  { 0, _, _, _, 1, _, _, _, 2, 3, _, _, 4, 5, 6, 7 },
  { 0, 1, _, _, 2, _, _, _, 3, 4, _, _, 5, 6, 7, 8 },
  { 0, 1, 2, _, 3, _, _, _, 4, 5, _, _, 6, 7, 8, 9 },
  { 0, 1, 2, 3, 4, _, _, _, 5, 6, _, _, 7, 8, 9,10 },
  { 0, _, _, _, 1, 2, _, _, 3, 4, _, _, 5, 6, 7, 8 },
  { 0, 1, _, _, 2, 3, _, _, 4, 5, _, _, 6, 7, 8, 9 },
  { 0, 1, 2, _, 3, 4, _, _, 5, 6, _, _, 7, 8, 9,10 },
  { 0, 1, 2, 3, 4, 5, _, _, 6, 7, _, _, 8, 9,10,11 },
  { 0, _, _, _, 1, 2, 3, _, 4, 5, _, _, 6, 7, 8, 9 },
  { 0, 1, _, _, 2, 3, 4, _, 5, 6, _, _, 7, 8, 9,10 },
  { 0, 1, 2, _, 3, 4, 5, _, 6, 7, _, _, 8, 9,10,11 },
  { 0, 1, 2, 3, 4, 5, 6, _, 7, 8, _, _, 9,10,11,12 },
  { 0, _, _, _, 1, 2, 3, 4, 5, 6, _, _, 7, 8, 9,10 },
  { 0, 1, _, _, 2, 3, 4, 5, 6, 7, _, _, 8, 9,10,11 },
  { 0, 1, 2, _, 3, 4, 5, 6, 7, 8, _, _, 9,10,11,12 },
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, _, _,10,11,12,13 },
  { 0, _, _, _, 1, _, _, _, 2, 3, 4, _, 5, 6, 7, 8 },
  { 0, 1, _, _, 2, _, _, _, 3, 4, 5, _, 6, 7, 8, 9 },
  { 0, 1, 2, _, 3, _, _, _, 4, 5, 6, _, 7, 8, 9,10 },
  { 0, 1, 2, 3, 4, _, _, _, 5, 6, 7, _, 8, 9,10,11 },
  { 0, _, _, _, 1, 2, _, _, 3, 4, 5, _, 6, 7, 8, 9 },
  { 0, 1, _, _, 2, 3, _, _, 4, 5, 6, _, 7, 8, 9,10 },
  { 0, 1, 2, _, 3, 4, _, _, 5, 6, 7, _, 8, 9,10,11 },
  { 0, 1, 2, 3, 4, 5, _, _, 6, 7, 8, _, 9,10,11,12 },
  { 0, _, _, _, 1, 2, 3, _, 4, 5, 6, _, 7, 8, 9,10 },
  { 0, 1, _, _, 2, 3, 4, _, 5, 6, 7, _, 8, 9,10,11 },
  { 0, 1, 2, _, 3, 4, 5, _, 6, 7, 8, _, 9,10,11,12 },
  { 0, 1, 2, 3, 4, 5, 6, _, 7, 8, 9, _,10,11,12,13 },
  { 0, _, _, _, 1, 2, 3, 4, 5, 6, 7, _, 8, 9,10,11 },
  { 0, 1, _, _, 2, 3, 4, 5, 6, 7, 8, _, 9,10,11,12 },
  { 0, 1, 2, _, 3, 4, 5, 6, 7, 8, 9, _,10,11,12,13 },
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, _,11,12,13,14 },
  { 0, _, _, _, 1, _, _, _, 2, 3, 4, 5, 6, 7, 8, 9 },
  { 0, 1, _, _, 2, _, _, _, 3, 4, 5, 6, 7, 8, 9,10 },
  { 0, 1, 2, _, 3, _, _, _, 4, 5, 6, 7, 8, 9,10,11 },
  { 0, 1, 2, 3, 4, _, _, _, 5, 6, 7, 8, 9,10,11,12 },
  { 0, _, _, _, 1, 2, _, _, 3, 4, 5, 6, 7, 8, 9,10 },
  { 0, 1, _, _, 2, 3, _, _, 4, 5, 6, 7, 8, 9,10,11 },
  { 0, 1, 2, _, 3, 4, _, _, 5, 6, 7, 8, 9,10,11,12 },
  { 0, 1, 2, 3, 4, 5, _, _, 6, 7, 8, 9,10,11,12,13 },
  { 0, _, _, _, 1, 2, 3, _, 4, 5, 6, 7, 8, 9,10,11 },
  { 0, 1, _, _, 2, 3, 4, _, 5, 6, 7, 8, 9,10,11,12 },
  { 0, 1, 2, _, 3, 4, 5, _, 6, 7, 8, 9,10,11,12,13 },
  { 0, 1, 2, 3, 4, 5, 6, _, 7, 8, 9,10,11,12,13,14 },
  { 0, _, _, _, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12 },
  { 0, 1, _, _, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13 },
  { 0, 1, 2, _, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14 },
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15 }
};
|
|
|
|
#define SVD16(_m_,_i_) _mm_loadu_si128(svd16[(uint8_t)(_m_>>(_i_<<3))])
|
|
static const ALIGNED(unsigned char, svd16[256][16],16) = {
|
|
{ 0, _, 1, _, 2, _, 3, _, 4, _, 5, _, 6, _, 7, _},
|
|
{ 0, 1, 2, _, 3, _, 4, _, 5, _, 6, _, 7, _, 8, _},
|
|
{ 0, _, 1, 2, 3, _, 4, _, 5, _, 6, _, 7, _, 8, _},
|
|
{ 0, 1, 2, 3, 4, _, 5, _, 6, _, 7, _, 8, _, 9, _},
|
|
{ 0, _, 1, _, 2, 3, 4, _, 5, _, 6, _, 7, _, 8, _},
|
|
{ 0, 1, 2, _, 3, 4, 5, _, 6, _, 7, _, 8, _, 9, _},
|
|
{ 0, _, 1, 2, 3, 4, 5, _, 6, _, 7, _, 8, _, 9, _},
|
|
{ 0, 1, 2, 3, 4, 5, 6, _, 7, _, 8, _, 9, _,10, _},
|
|
{ 0, _, 1, _, 2, _, 3, 4, 5, _, 6, _, 7, _, 8, _},
|
|
{ 0, 1, 2, _, 3, _, 4, 5, 6, _, 7, _, 8, _, 9, _},
|
|
{ 0, _, 1, 2, 3, _, 4, 5, 6, _, 7, _, 8, _, 9, _},
|
|
{ 0, 1, 2, 3, 4, _, 5, 6, 7, _, 8, _, 9, _,10, _},
|
|
{ 0, _, 1, _, 2, 3, 4, 5, 6, _, 7, _, 8, _, 9, _},
|
|
{ 0, 1, 2, _, 3, 4, 5, 6, 7, _, 8, _, 9, _,10, _},
|
|
{ 0, _, 1, 2, 3, 4, 5, 6, 7, _, 8, _, 9, _,10, _},
|
|
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, _, 9, _,10, _,11, _},
|
|
{ 0, _, 1, _, 2, _, 3, _, 4, 5, 6, _, 7, _, 8, _},
|
|
{ 0, 1, 2, _, 3, _, 4, _, 5, 6, 7, _, 8, _, 9, _},
|
|
{ 0, _, 1, 2, 3, _, 4, _, 5, 6, 7, _, 8, _, 9, _},
|
|
{ 0, 1, 2, 3, 4, _, 5, _, 6, 7, 8, _, 9, _,10, _},
|
|
{ 0, _, 1, _, 2, 3, 4, _, 5, 6, 7, _, 8, _, 9, _},
|
|
{ 0, 1, 2, _, 3, 4, 5, _, 6, 7, 8, _, 9, _,10, _},
|
|
{ 0, _, 1, 2, 3, 4, 5, _, 6, 7, 8, _, 9, _,10, _},
|
|
{ 0, 1, 2, 3, 4, 5, 6, _, 7, 8, 9, _,10, _,11, _},
|
|
{ 0, _, 1, _, 2, _, 3, 4, 5, 6, 7, _, 8, _, 9, _},
|
|
{ 0, 1, 2, _, 3, _, 4, 5, 6, 7, 8, _, 9, _,10, _},
|
|
{ 0, _, 1, 2, 3, _, 4, 5, 6, 7, 8, _, 9, _,10, _},
|
|
{ 0, 1, 2, 3, 4, _, 5, 6, 7, 8, 9, _,10, _,11, _},
|
|
{ 0, _, 1, _, 2, 3, 4, 5, 6, 7, 8, _, 9, _,10, _},
|
|
{ 0, 1, 2, _, 3, 4, 5, 6, 7, 8, 9, _,10, _,11, _},
|
|
{ 0, _, 1, 2, 3, 4, 5, 6, 7, 8, 9, _,10, _,11, _},
|
|
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, _,11, _,12, _},
|
|
{ 0, _, 1, _, 2, _, 3, _, 4, _, 5, 6, 7, _, 8, _},
|
|
{ 0, 1, 2, _, 3, _, 4, _, 5, _, 6, 7, 8, _, 9, _},
|
|
{ 0, _, 1, 2, 3, _, 4, _, 5, _, 6, 7, 8, _, 9, _},
|
|
{ 0, 1, 2, 3, 4, _, 5, _, 6, _, 7, 8, 9, _,10, _},
|
|
{ 0, _, 1, _, 2, 3, 4, _, 5, _, 6, 7, 8, _, 9, _},
|
|
{ 0, 1, 2, _, 3, 4, 5, _, 6, _, 7, 8, 9, _,10, _},
|
|
{ 0, _, 1, 2, 3, 4, 5, _, 6, _, 7, 8, 9, _,10, _},
|
|
{ 0, 1, 2, 3, 4, 5, 6, _, 7, _, 8, 9,10, _,11, _},
|
|
{ 0, _, 1, _, 2, _, 3, 4, 5, _, 6, 7, 8, _, 9, _},
|
|
{ 0, 1, 2, _, 3, _, 4, 5, 6, _, 7, 8, 9, _,10, _},
|
|
{ 0, _, 1, 2, 3, _, 4, 5, 6, _, 7, 8, 9, _,10, _},
|
|
{ 0, 1, 2, 3, 4, _, 5, 6, 7, _, 8, 9,10, _,11, _},
|
|
{ 0, _, 1, _, 2, 3, 4, 5, 6, _, 7, 8, 9, _,10, _},
|
|
{ 0, 1, 2, _, 3, 4, 5, 6, 7, _, 8, 9,10, _,11, _},
|
|
{ 0, _, 1, 2, 3, 4, 5, 6, 7, _, 8, 9,10, _,11, _},
|
|
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, _, 9,10,11, _,12, _},
|
|
{ 0, _, 1, _, 2, _, 3, _, 4, 5, 6, 7, 8, _, 9, _},
|
|
{ 0, 1, 2, _, 3, _, 4, _, 5, 6, 7, 8, 9, _,10, _},
|
|
{ 0, _, 1, 2, 3, _, 4, _, 5, 6, 7, 8, 9, _,10, _},
|
|
{ 0, 1, 2, 3, 4, _, 5, _, 6, 7, 8, 9,10, _,11, _},
|
|
{ 0, _, 1, _, 2, 3, 4, _, 5, 6, 7, 8, 9, _,10, _},
|
|
{ 0, 1, 2, _, 3, 4, 5, _, 6, 7, 8, 9,10, _,11, _},
|
|
{ 0, _, 1, 2, 3, 4, 5, _, 6, 7, 8, 9,10, _,11, _},
|
|
{ 0, 1, 2, 3, 4, 5, 6, _, 7, 8, 9,10,11, _,12, _},
|
|
{ 0, _, 1, _, 2, _, 3, 4, 5, 6, 7, 8, 9, _,10, _},
|
|
{ 0, 1, 2, _, 3, _, 4, 5, 6, 7, 8, 9,10, _,11, _},
|
|
{ 0, _, 1, 2, 3, _, 4, 5, 6, 7, 8, 9,10, _,11, _},
|
|
{ 0, 1, 2, 3, 4, _, 5, 6, 7, 8, 9,10,11, _,12, _},
|
|
{ 0, _, 1, _, 2, 3, 4, 5, 6, 7, 8, 9,10, _,11, _},
|
|
{ 0, 1, 2, _, 3, 4, 5, 6, 7, 8, 9,10,11, _,12, _},
|
|
{ 0, _, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, _,12, _},
|
|
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, _,13, _},
|
|
{ 0, _, 1, _, 2, _, 3, _, 4, _, 5, _, 6, 7, 8, _},
|
|
{ 0, 1, 2, _, 3, _, 4, _, 5, _, 6, _, 7, 8, 9, _},
|
|
{ 0, _, 1, 2, 3, _, 4, _, 5, _, 6, _, 7, 8, 9, _},
|
|
{ 0, 1, 2, 3, 4, _, 5, _, 6, _, 7, _, 8, 9,10, _},
|
|
{ 0, _, 1, _, 2, 3, 4, _, 5, _, 6, _, 7, 8, 9, _},
|
|
{ 0, 1, 2, _, 3, 4, 5, _, 6, _, 7, _, 8, 9,10, _},
|
|
{ 0, _, 1, 2, 3, 4, 5, _, 6, _, 7, _, 8, 9,10, _},
|
|
{ 0, 1, 2, 3, 4, 5, 6, _, 7, _, 8, _, 9,10,11, _},
|
|
{ 0, _, 1, _, 2, _, 3, 4, 5, _, 6, _, 7, 8, 9, _},
|
|
{ 0, 1, 2, _, 3, _, 4, 5, 6, _, 7, _, 8, 9,10, _},
|
|
{ 0, _, 1, 2, 3, _, 4, 5, 6, _, 7, _, 8, 9,10, _},
|
|
{ 0, 1, 2, 3, 4, _, 5, 6, 7, _, 8, _, 9,10,11, _},
|
|
{ 0, _, 1, _, 2, 3, 4, 5, 6, _, 7, _, 8, 9,10, _},
|
|
{ 0, 1, 2, _, 3, 4, 5, 6, 7, _, 8, _, 9,10,11, _},
|
|
{ 0, _, 1, 2, 3, 4, 5, 6, 7, _, 8, _, 9,10,11, _},
|
|
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, _, 9, _,10,11,12, _},
|
|
{ 0, _, 1, _, 2, _, 3, _, 4, 5, 6, _, 7, 8, 9, _},
|
|
{ 0, 1, 2, _, 3, _, 4, _, 5, 6, 7, _, 8, 9,10, _},
|
|
{ 0, _, 1, 2, 3, _, 4, _, 5, 6, 7, _, 8, 9,10, _},
|
|
{ 0, 1, 2, 3, 4, _, 5, _, 6, 7, 8, _, 9,10,11, _},
|
|
{ 0, _, 1, _, 2, 3, 4, _, 5, 6, 7, _, 8, 9,10, _},
|
|
{ 0, 1, 2, _, 3, 4, 5, _, 6, 7, 8, _, 9,10,11, _},
|
|
{ 0, _, 1, 2, 3, 4, 5, _, 6, 7, 8, _, 9,10,11, _},
|
|
{ 0, 1, 2, 3, 4, 5, 6, _, 7, 8, 9, _,10,11,12, _},
|
|
{ 0, _, 1, _, 2, _, 3, 4, 5, 6, 7, _, 8, 9,10, _},
|
|
{ 0, 1, 2, _, 3, _, 4, 5, 6, 7, 8, _, 9,10,11, _},
|
|
{ 0, _, 1, 2, 3, _, 4, 5, 6, 7, 8, _, 9,10,11, _},
|
|
{ 0, 1, 2, 3, 4, _, 5, 6, 7, 8, 9, _,10,11,12, _},
|
|
{ 0, _, 1, _, 2, 3, 4, 5, 6, 7, 8, _, 9,10,11, _},
|
|
{ 0, 1, 2, _, 3, 4, 5, 6, 7, 8, 9, _,10,11,12, _},
|
|
{ 0, _, 1, 2, 3, 4, 5, 6, 7, 8, 9, _,10,11,12, _},
|
|
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, _,11,12,13, _},
|
|
{ 0, _, 1, _, 2, _, 3, _, 4, _, 5, 6, 7, 8, 9, _},
|
|
{ 0, 1, 2, _, 3, _, 4, _, 5, _, 6, 7, 8, 9,10, _},
|
|
{ 0, _, 1, 2, 3, _, 4, _, 5, _, 6, 7, 8, 9,10, _},
|
|
{ 0, 1, 2, 3, 4, _, 5, _, 6, _, 7, 8, 9,10,11, _},
|
|
{ 0, _, 1, _, 2, 3, 4, _, 5, _, 6, 7, 8, 9,10, _},
|
|
{ 0, 1, 2, _, 3, 4, 5, _, 6, _, 7, 8, 9,10,11, _},
|
|
{ 0, _, 1, 2, 3, 4, 5, _, 6, _, 7, 8, 9,10,11, _},
|
|
{ 0, 1, 2, 3, 4, 5, 6, _, 7, _, 8, 9,10,11,12, _},
|
|
{ 0, _, 1, _, 2, _, 3, 4, 5, _, 6, 7, 8, 9,10, _},
|
|
{ 0, 1, 2, _, 3, _, 4, 5, 6, _, 7, 8, 9,10,11, _},
|
|
{ 0, _, 1, 2, 3, _, 4, 5, 6, _, 7, 8, 9,10,11, _},
|
|
{ 0, 1, 2, 3, 4, _, 5, 6, 7, _, 8, 9,10,11,12, _},
|
|
{ 0, _, 1, _, 2, 3, 4, 5, 6, _, 7, 8, 9,10,11, _},
|
|
{ 0, 1, 2, _, 3, 4, 5, 6, 7, _, 8, 9,10,11,12, _},
|
|
{ 0, _, 1, 2, 3, 4, 5, 6, 7, _, 8, 9,10,11,12, _},
|
|
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, _, 9,10,11,12,13, _},
|
|
{ 0, _, 1, _, 2, _, 3, _, 4, 5, 6, 7, 8, 9,10, _},
|
|
{ 0, 1, 2, _, 3, _, 4, _, 5, 6, 7, 8, 9,10,11, _},
|
|
{ 0, _, 1, 2, 3, _, 4, _, 5, 6, 7, 8, 9,10,11, _},
|
|
{ 0, 1, 2, 3, 4, _, 5, _, 6, 7, 8, 9,10,11,12, _},
|
|
{ 0, _, 1, _, 2, 3, 4, _, 5, 6, 7, 8, 9,10,11, _},
|
|
{ 0, 1, 2, _, 3, 4, 5, _, 6, 7, 8, 9,10,11,12, _},
|
|
{ 0, _, 1, 2, 3, 4, 5, _, 6, 7, 8, 9,10,11,12, _},
|
|
{ 0, 1, 2, 3, 4, 5, 6, _, 7, 8, 9,10,11,12,13, _},
|
|
{ 0, _, 1, _, 2, _, 3, 4, 5, 6, 7, 8, 9,10,11, _},
|
|
{ 0, 1, 2, _, 3, _, 4, 5, 6, 7, 8, 9,10,11,12, _},
|
|
{ 0, _, 1, 2, 3, _, 4, 5, 6, 7, 8, 9,10,11,12, _},
|
|
{ 0, 1, 2, 3, 4, _, 5, 6, 7, 8, 9,10,11,12,13, _},
|
|
{ 0, _, 1, _, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, _},
|
|
{ 0, 1, 2, _, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, _},
|
|
{ 0, _, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, _},
|
|
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, _},
|
|
{ 0, _, 1, _, 2, _, 3, _, 4, _, 5, _, 6, _, 7, 8},
|
|
{ 0, 1, 2, _, 3, _, 4, _, 5, _, 6, _, 7, _, 8, 9},
|
|
{ 0, _, 1, 2, 3, _, 4, _, 5, _, 6, _, 7, _, 8, 9},
|
|
{ 0, 1, 2, 3, 4, _, 5, _, 6, _, 7, _, 8, _, 9,10},
|
|
{ 0, _, 1, _, 2, 3, 4, _, 5, _, 6, _, 7, _, 8, 9},
|
|
{ 0, 1, 2, _, 3, 4, 5, _, 6, _, 7, _, 8, _, 9,10},
|
|
{ 0, _, 1, 2, 3, 4, 5, _, 6, _, 7, _, 8, _, 9,10},
|
|
{ 0, 1, 2, 3, 4, 5, 6, _, 7, _, 8, _, 9, _,10,11},
|
|
{ 0, _, 1, _, 2, _, 3, 4, 5, _, 6, _, 7, _, 8, 9},
|
|
{ 0, 1, 2, _, 3, _, 4, 5, 6, _, 7, _, 8, _, 9,10},
|
|
{ 0, _, 1, 2, 3, _, 4, 5, 6, _, 7, _, 8, _, 9,10},
|
|
{ 0, 1, 2, 3, 4, _, 5, 6, 7, _, 8, _, 9, _,10,11},
|
|
{ 0, _, 1, _, 2, 3, 4, 5, 6, _, 7, _, 8, _, 9,10},
|
|
{ 0, 1, 2, _, 3, 4, 5, 6, 7, _, 8, _, 9, _,10,11},
|
|
{ 0, _, 1, 2, 3, 4, 5, 6, 7, _, 8, _, 9, _,10,11},
|
|
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, _, 9, _,10, _,11,12},
|
|
{ 0, _, 1, _, 2, _, 3, _, 4, 5, 6, _, 7, _, 8, 9},
|
|
{ 0, 1, 2, _, 3, _, 4, _, 5, 6, 7, _, 8, _, 9,10},
|
|
{ 0, _, 1, 2, 3, _, 4, _, 5, 6, 7, _, 8, _, 9,10},
|
|
{ 0, 1, 2, 3, 4, _, 5, _, 6, 7, 8, _, 9, _,10,11},
|
|
{ 0, _, 1, _, 2, 3, 4, _, 5, 6, 7, _, 8, _, 9,10},
|
|
{ 0, 1, 2, _, 3, 4, 5, _, 6, 7, 8, _, 9, _,10,11},
|
|
{ 0, _, 1, 2, 3, 4, 5, _, 6, 7, 8, _, 9, _,10,11},
|
|
{ 0, 1, 2, 3, 4, 5, 6, _, 7, 8, 9, _,10, _,11,12},
|
|
{ 0, _, 1, _, 2, _, 3, 4, 5, 6, 7, _, 8, _, 9,10},
|
|
{ 0, 1, 2, _, 3, _, 4, 5, 6, 7, 8, _, 9, _,10,11},
|
|
{ 0, _, 1, 2, 3, _, 4, 5, 6, 7, 8, _, 9, _,10,11},
|
|
{ 0, 1, 2, 3, 4, _, 5, 6, 7, 8, 9, _,10, _,11,12},
|
|
{ 0, _, 1, _, 2, 3, 4, 5, 6, 7, 8, _, 9, _,10,11},
|
|
{ 0, 1, 2, _, 3, 4, 5, 6, 7, 8, 9, _,10, _,11,12},
|
|
{ 0, _, 1, 2, 3, 4, 5, 6, 7, 8, 9, _,10, _,11,12},
|
|
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, _,11, _,12,13},
|
|
{ 0, _, 1, _, 2, _, 3, _, 4, _, 5, 6, 7, _, 8, 9},
|
|
{ 0, 1, 2, _, 3, _, 4, _, 5, _, 6, 7, 8, _, 9,10},
|
|
{ 0, _, 1, 2, 3, _, 4, _, 5, _, 6, 7, 8, _, 9,10},
|
|
{ 0, 1, 2, 3, 4, _, 5, _, 6, _, 7, 8, 9, _,10,11},
|
|
{ 0, _, 1, _, 2, 3, 4, _, 5, _, 6, 7, 8, _, 9,10},
|
|
{ 0, 1, 2, _, 3, 4, 5, _, 6, _, 7, 8, 9, _,10,11},
|
|
{ 0, _, 1, 2, 3, 4, 5, _, 6, _, 7, 8, 9, _,10,11},
|
|
{ 0, 1, 2, 3, 4, 5, 6, _, 7, _, 8, 9,10, _,11,12},
|
|
{ 0, _, 1, _, 2, _, 3, 4, 5, _, 6, 7, 8, _, 9,10},
|
|
{ 0, 1, 2, _, 3, _, 4, 5, 6, _, 7, 8, 9, _,10,11},
|
|
{ 0, _, 1, 2, 3, _, 4, 5, 6, _, 7, 8, 9, _,10,11},
|
|
{ 0, 1, 2, 3, 4, _, 5, 6, 7, _, 8, 9,10, _,11,12},
|
|
{ 0, _, 1, _, 2, 3, 4, 5, 6, _, 7, 8, 9, _,10,11},
|
|
{ 0, 1, 2, _, 3, 4, 5, 6, 7, _, 8, 9,10, _,11,12},
|
|
{ 0, _, 1, 2, 3, 4, 5, 6, 7, _, 8, 9,10, _,11,12},
|
|
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, _, 9,10,11, _,12,13},
|
|
{ 0, _, 1, _, 2, _, 3, _, 4, 5, 6, 7, 8, _, 9,10},
|
|
{ 0, 1, 2, _, 3, _, 4, _, 5, 6, 7, 8, 9, _,10,11},
|
|
{ 0, _, 1, 2, 3, _, 4, _, 5, 6, 7, 8, 9, _,10,11},
|
|
{ 0, 1, 2, 3, 4, _, 5, _, 6, 7, 8, 9,10, _,11,12},
|
|
{ 0, _, 1, _, 2, 3, 4, _, 5, 6, 7, 8, 9, _,10,11},
|
|
{ 0, 1, 2, _, 3, 4, 5, _, 6, 7, 8, 9,10, _,11,12},
|
|
{ 0, _, 1, 2, 3, 4, 5, _, 6, 7, 8, 9,10, _,11,12},
|
|
{ 0, 1, 2, 3, 4, 5, 6, _, 7, 8, 9,10,11, _,12,13},
|
|
{ 0, _, 1, _, 2, _, 3, 4, 5, 6, 7, 8, 9, _,10,11},
|
|
{ 0, 1, 2, _, 3, _, 4, 5, 6, 7, 8, 9,10, _,11,12},
|
|
{ 0, _, 1, 2, 3, _, 4, 5, 6, 7, 8, 9,10, _,11,12},
|
|
{ 0, 1, 2, 3, 4, _, 5, 6, 7, 8, 9,10,11, _,12,13},
|
|
{ 0, _, 1, _, 2, 3, 4, 5, 6, 7, 8, 9,10, _,11,12},
|
|
{ 0, 1, 2, _, 3, 4, 5, 6, 7, 8, 9,10,11, _,12,13},
|
|
{ 0, _, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, _,12,13},
|
|
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, _,13,14},
|
|
{ 0, _, 1, _, 2, _, 3, _, 4, _, 5, _, 6, 7, 8, 9},
|
|
{ 0, 1, 2, _, 3, _, 4, _, 5, _, 6, _, 7, 8, 9,10},
|
|
{ 0, _, 1, 2, 3, _, 4, _, 5, _, 6, _, 7, 8, 9,10},
|
|
{ 0, 1, 2, 3, 4, _, 5, _, 6, _, 7, _, 8, 9,10,11},
|
|
{ 0, _, 1, _, 2, 3, 4, _, 5, _, 6, _, 7, 8, 9,10},
|
|
{ 0, 1, 2, _, 3, 4, 5, _, 6, _, 7, _, 8, 9,10,11},
|
|
{ 0, _, 1, 2, 3, 4, 5, _, 6, _, 7, _, 8, 9,10,11},
|
|
{ 0, 1, 2, 3, 4, 5, 6, _, 7, _, 8, _, 9,10,11,12},
|
|
{ 0, _, 1, _, 2, _, 3, 4, 5, _, 6, _, 7, 8, 9,10},
|
|
{ 0, 1, 2, _, 3, _, 4, 5, 6, _, 7, _, 8, 9,10,11},
|
|
{ 0, _, 1, 2, 3, _, 4, 5, 6, _, 7, _, 8, 9,10,11},
|
|
{ 0, 1, 2, 3, 4, _, 5, 6, 7, _, 8, _, 9,10,11,12},
|
|
{ 0, _, 1, _, 2, 3, 4, 5, 6, _, 7, _, 8, 9,10,11},
|
|
{ 0, 1, 2, _, 3, 4, 5, 6, 7, _, 8, _, 9,10,11,12},
|
|
{ 0, _, 1, 2, 3, 4, 5, 6, 7, _, 8, _, 9,10,11,12},
|
|
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, _, 9, _,10,11,12,13},
|
|
{ 0, _, 1, _, 2, _, 3, _, 4, 5, 6, _, 7, 8, 9,10},
|
|
{ 0, 1, 2, _, 3, _, 4, _, 5, 6, 7, _, 8, 9,10,11},
|
|
{ 0, _, 1, 2, 3, _, 4, _, 5, 6, 7, _, 8, 9,10,11},
|
|
{ 0, 1, 2, 3, 4, _, 5, _, 6, 7, 8, _, 9,10,11,12},
|
|
{ 0, _, 1, _, 2, 3, 4, _, 5, 6, 7, _, 8, 9,10,11},
|
|
{ 0, 1, 2, _, 3, 4, 5, _, 6, 7, 8, _, 9,10,11,12},
|
|
{ 0, _, 1, 2, 3, 4, 5, _, 6, 7, 8, _, 9,10,11,12},
|
|
{ 0, 1, 2, 3, 4, 5, 6, _, 7, 8, 9, _,10,11,12,13},
|
|
{ 0, _, 1, _, 2, _, 3, 4, 5, 6, 7, _, 8, 9,10,11},
|
|
{ 0, 1, 2, _, 3, _, 4, 5, 6, 7, 8, _, 9,10,11,12},
|
|
{ 0, _, 1, 2, 3, _, 4, 5, 6, 7, 8, _, 9,10,11,12},
|
|
{ 0, 1, 2, 3, 4, _, 5, 6, 7, 8, 9, _,10,11,12,13},
|
|
{ 0, _, 1, _, 2, 3, 4, 5, 6, 7, 8, _, 9,10,11,12},
|
|
{ 0, 1, 2, _, 3, 4, 5, 6, 7, 8, 9, _,10,11,12,13},
|
|
{ 0, _, 1, 2, 3, 4, 5, 6, 7, 8, 9, _,10,11,12,13},
|
|
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, _,11,12,13,14},
|
|
{ 0, _, 1, _, 2, _, 3, _, 4, _, 5, 6, 7, 8, 9,10},
|
|
{ 0, 1, 2, _, 3, _, 4, _, 5, _, 6, 7, 8, 9,10,11},
|
|
{ 0, _, 1, 2, 3, _, 4, _, 5, _, 6, 7, 8, 9,10,11},
|
|
{ 0, 1, 2, 3, 4, _, 5, _, 6, _, 7, 8, 9,10,11,12},
|
|
{ 0, _, 1, _, 2, 3, 4, _, 5, _, 6, 7, 8, 9,10,11},
|
|
{ 0, 1, 2, _, 3, 4, 5, _, 6, _, 7, 8, 9,10,11,12},
|
|
{ 0, _, 1, 2, 3, 4, 5, _, 6, _, 7, 8, 9,10,11,12},
|
|
{ 0, 1, 2, 3, 4, 5, 6, _, 7, _, 8, 9,10,11,12,13},
|
|
{ 0, _, 1, _, 2, _, 3, 4, 5, _, 6, 7, 8, 9,10,11},
|
|
{ 0, 1, 2, _, 3, _, 4, 5, 6, _, 7, 8, 9,10,11,12},
|
|
{ 0, _, 1, 2, 3, _, 4, 5, 6, _, 7, 8, 9,10,11,12},
|
|
{ 0, 1, 2, 3, 4, _, 5, 6, 7, _, 8, 9,10,11,12,13},
|
|
{ 0, _, 1, _, 2, 3, 4, 5, 6, _, 7, 8, 9,10,11,12},
|
|
{ 0, 1, 2, _, 3, 4, 5, 6, 7, _, 8, 9,10,11,12,13},
|
|
{ 0, _, 1, 2, 3, 4, 5, 6, 7, _, 8, 9,10,11,12,13},
|
|
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, _, 9,10,11,12,13,14},
|
|
{ 0, _, 1, _, 2, _, 3, _, 4, 5, 6, 7, 8, 9,10,11},
|
|
{ 0, 1, 2, _, 3, _, 4, _, 5, 6, 7, 8, 9,10,11,12},
|
|
{ 0, _, 1, 2, 3, _, 4, _, 5, 6, 7, 8, 9,10,11,12},
|
|
{ 0, 1, 2, 3, 4, _, 5, _, 6, 7, 8, 9,10,11,12,13},
|
|
{ 0, _, 1, _, 2, 3, 4, _, 5, 6, 7, 8, 9,10,11,12},
|
|
{ 0, 1, 2, _, 3, 4, 5, _, 6, 7, 8, 9,10,11,12,13},
|
|
{ 0, _, 1, 2, 3, 4, 5, _, 6, 7, 8, 9,10,11,12,13},
|
|
{ 0, 1, 2, 3, 4, 5, 6, _, 7, 8, 9,10,11,12,13,14},
|
|
{ 0, _, 1, _, 2, _, 3, 4, 5, 6, 7, 8, 9,10,11,12},
|
|
{ 0, 1, 2, _, 3, _, 4, 5, 6, 7, 8, 9,10,11,12,13},
|
|
{ 0, _, 1, 2, 3, _, 4, 5, 6, 7, 8, 9,10,11,12,13},
|
|
{ 0, 1, 2, 3, 4, _, 5, 6, 7, 8, 9,10,11,12,13,14},
|
|
{ 0, _, 1, _, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13},
|
|
{ 0, 1, 2, _, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14},
|
|
{ 0, _, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14},
|
|
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}
|
|
};
|
|
#undef _
|
|
|
|
#define LENBLOCK // All length keys encoded at the beginning of the buffer.
|
|
#ifdef LENBLOCK
|
|
#define OP out
|
|
#define IP in
|
|
#define IPINC 0
|
|
#define DATABEG(_p_,_n_,_usize_) _p_ + V8PAYLOAD(_n_, _usize_)
|
|
#define PNEXT(_p0_,_p_,_i_) _p0_ += _i_
|
|
#define PNEXTA(_p0_,_p_,_i_) 0
|
|
#define PNEXTB(_p0_,_i_) _p0_ += _i_
|
|
#else
|
|
#define OP op
|
|
#define IP ip
|
|
#define IPINC 8
|
|
#define DATABEG(_p_,_n_,_s_) _p_
|
|
#define PNEXT(_p0_,_p_,_i_) _p0_ = _p_
|
|
#define PNEXTA(_p0_,_p_,_i_) _p0_ = _p_, _p_+=_i_
|
|
#define PNEXTB(_p0_,_i_)
|
|
#endif
|
|
|
|
//----------------------------------- Templates parameter macros -----------------------------------------------------------------
|
|
#define V8DELTA32
|
|
#define V8DELTA16
|
|
#define V8ENC v8enc
|
|
#define V8DEC v8dec
|
|
#define VE16(_x_) v = _x_
|
|
#define VD16(_x_) _x_
|
|
#define VE32(_x_) v = _x_
|
|
#define VD32(_x_) _x_
|
|
#define VEINI128v32
|
|
#define VEINI256v32
|
|
#define VE128v32(_v_,_sv_)
|
|
#define VE256v32(_v_,_sv_)
|
|
#define VDINI128v32
|
|
#define VDINI256v32
|
|
#define VD128v32(_ov_,_sv_)
|
|
#define VD256v32(_ov_,_sv_)
|
|
|
|
#define VEINI128v16
|
|
#define VDINI128v16
|
|
#define VE128v16(_ov_,_sv_)
|
|
#define VD128v16(_ov_,_sv_)
|
|
#include "v8.c"
|
|
|
|
#define V8DELTA32 ,uint32_t start
|
|
#define V8DELTA16 ,uint16_t start
|
|
|
|
#define V8ENC v8zenc //------------ zigzag -----------------------------
|
|
#define V8DEC v8zdec
|
|
#define VDELTA 0
|
|
|
|
#define VEINI128v16 __m128i sv = _mm_set1_epi16(start); const __m128i zv = _mm_setzero_si128()
|
|
#define VEINI128v32 __m128i sv = _mm_set1_epi32(start); const __m128i zv = _mm_setzero_si128()
|
|
#define VEINI256v32 __m256i sv = _mm256_set1_epi32(start)
|
|
|
|
#define VE16(_x_) v = zigzagenc16((_x_)-start); start = _x_
|
|
#define VE32(_x_) v = zigzagenc32((_x_)-start); start = _x_
|
|
|
|
#define VD16(_x_) (start += zigzagdec16(_x_))
|
|
#define VD32(_x_) (start += zigzagdec32(_x_))
|
|
|
|
#define VE128v16(_iv_,_sv_) { __m128i _tv = mm_delta_epi16(_iv_,_sv_); _sv_ = _iv_; _iv_ = mm_zzage_epi16(_tv); }
|
|
#define VE128v32(_iv_,_sv_) { __m128i _tv = mm_delta_epi32(_iv_,_sv_); _sv_ = _iv_; _iv_ = mm_zzage_epi32(_tv); }
|
|
#define VE256v32(_iv_,_sv_) { __m256i _tv = mm256_delta_epi32(_iv_,_sv_); _sv_ = _iv_; _iv_ = mm256_zzage_epi32(_tv); }
|
|
|
|
#define VDINI128v16 __m128i sv = _mm_set1_epi16(start); const __m128i zv = _mm_setzero_si128()
|
|
#define VDINI128v32 __m128i sv = _mm_set1_epi32(start); const __m128i zv = _mm_setzero_si128()
|
|
#define VDINI256v32 __m256i sv = _mm256_set1_epi32(start); const __m256i zv = _mm256_setzero_si256()
|
|
|
|
#define VD128v16(_v_,_sv_) _v_ = mm_zzagd_epi16( _v_); _sv_ = mm_scan_epi16(_v_,_sv_); _v_ = _sv_
|
|
#define VD128v32(_v_,_sv_) _v_ = mm_zzagd_epi32( _v_); _sv_ = mm_scan_epi32(_v_,_sv_); _v_ = _sv_
|
|
#define VD256v32(_v_,_sv_) _v_ = mm256_zzagd_epi32(_v_); _sv_ = mm256_scan_epi32(_v_,_sv_); _v_ = _sv_
|
|
|
|
#include "v8.c"
|
|
|
|
#define V8ENC v8xenc //------------ xor -----------------------------
|
|
#define V8DEC v8xdec
|
|
#define VDELTA 0
|
|
|
|
#define VEINI128v16 __m128i sv = _mm_set1_epi16(start);
|
|
#define VEINI128v32 __m128i sv = _mm_set1_epi32(start);
|
|
#define VEINI256v32 __m256i sv = _mm256_set1_epi32(start)
|
|
|
|
#define VE16(_x_) v = (_x_)^start; start = _x_
|
|
#define VE32(_x_) v = (_x_)^start; start = _x_
|
|
|
|
#define VD16(_x_) (start ^= _x_)
|
|
#define VD32(_x_) (start ^= _x_)
|
|
|
|
#define VE128v16(_iv_,_sv_) { __m128i _tv = _mm_xor_si128(_iv_,_sv_); _sv_ = _iv_; _iv_ = _tv; }
|
|
#define VE128v32(_iv_,_sv_) { __m128i _tv = _mm_xor_si128(_iv_,_sv_); _sv_ = _iv_; _iv_ = _tv; }
|
|
#define VE256v32(_iv_,_sv_) { __m256i _tv = _mm256_xor_si256(_iv_,_sv_); _sv_ = _iv_; _iv_ = _tv; }
|
|
|
|
#define VDINI128v16 __m128i sv = _mm_set1_epi16(start);
|
|
#define VDINI128v32 __m128i sv = _mm_set1_epi32(start);
|
|
#define VDINI256v32 __m256i sv = _mm256_set1_epi32(start);
|
|
|
|
#define VD128v16(_v_,_sv_) _v_ = _sv_ = _mm_xor_si128(_v_,_sv_);
|
|
#define VD128v32(_v_,_sv_) _v_ = _sv_ = _mm_xor_si128(_v_,_sv_);
|
|
#define VD256v32(_v_,_sv_) _v_ = _sv_ = _mm256_xor_si256(_v_,_sv_);
|
|
|
|
#include "v8.c"
|
|
|
|
|
|
#define V8ENC v8denc //---------- delta ----------------------------------
|
|
#define V8DEC v8ddec
|
|
#define VE16(_x_) v = (_x_)-start; start = _x_
|
|
#define VE32(_x_) VE16(_x_)
|
|
#define VD16(_x_) (start += _x_)
|
|
#define VD32(_x_) VD16(_x_)
|
|
|
|
#define VEINI128v16 __m128i sv = _mm_set1_epi16(start)
|
|
#define VEINI128v32 __m128i sv = _mm_set1_epi32(start)
|
|
#define VEINI256v32 __m256i sv = _mm256_set1_epi32(start)
|
|
|
|
#define VE128v16(_v_,_sv_) { __m128i _tv = mm_delta_epi16(_v_,_sv_); _sv_ = _v_; _v_ = _tv; }
|
|
#define VE128v32(_v_,_sv_) { __m128i _tv = mm_delta_epi32(_v_,_sv_); _sv_ = _v_; _v_ = _tv; }
|
|
#define VE256v32(_v_,_sv_) { __m256i _tv = mm256_delta_epi32(_v_,_sv_); _sv_ = _v_; _v_ = _tv; }
|
|
|
|
#define VDINI128v16 __m128i sv = _mm_set1_epi16(start); const __m128i zv = _mm_setzero_si128()
|
|
#define VDINI128v32 __m128i sv = _mm_set1_epi32(start); const __m128i zv = _mm_setzero_si128()
|
|
#define VDINI256v32 __m256i sv = _mm256_set1_epi32(start); const __m256i zv = _mm256_setzero_si256()
|
|
|
|
#define VD128v16(_v_,_sv_) _sv_ = mm_scan_epi16(_v_,_sv_); _v_ = _sv_
|
|
#define VD128v32(_v_,_sv_) _sv_ = mm_scan_epi32(_v_,_sv_); _v_ = _sv_
|
|
#define VD256v32(_v_,_sv_) _sv_ = mm256_scan_epi32(_v_,_sv_); _v_ = _sv_
|
|
#include "v8.c"
|
|
|
|
#define V8ENC v8d1enc // delta 1
|
|
#define V8DEC v8d1dec
|
|
#define VDELTA 1
|
|
|
|
#define VE16(_x_) v = (_x_)-start-VDELTA; start = _x_
|
|
#define VE32(_x_) VE16(_x_)
|
|
#define VD16(_x_) (start += _x_+VDELTA)
|
|
#define VD32(_x_) VD16(_x_)
|
|
|
|
#define VEINI128v16 __m128i sv = _mm_set1_epi16(start); const __m128i cv1_16 = _mm_set1_epi16(1)
|
|
#define VEINI128v32 __m128i sv = _mm_set1_epi32(start); const __m128i cv1_32 = _mm_set1_epi32(1)
|
|
#define VEINI256v32 __m256i sv = _mm256_set1_epi32(start); const __m256i cv1_32 = _mm256_set1_epi32(1)
|
|
|
|
#define VE128v16(_v_,_sv_) { __m128i _tv = _mm_sub_epi16(mm_delta_epi16(_v_,_sv_),cv1_16); _sv_ = _v_; _v_ = _tv; }
|
|
#define VE128v32(_v_,_sv_) { __m128i _tv = _mm_sub_epi32(mm_delta_epi32(_v_,_sv_),cv1_32); _sv_ = _v_; _v_ = _tv; }
|
|
#define VE256v32(_v_,_sv_) { __m256i _tv = _mm256_sub_epi32(mm256_delta_epi32(_v_,_sv_),cv1_32); _sv_ = _v_; _v_ = _tv; }
|
|
|
|
#define VDINI128v16 __m128i sv = _mm_set1_epi16(start); const __m128i zv = _mm_setzero_si128(), cvi = _mm_set_epi16(8,7,6,5,4,3,2,1)
|
|
#define VDINI128v32 __m128i sv = _mm_set1_epi32(start); const __m128i zv = _mm_setzero_si128(), cvi = _mm_set_epi32( 4,3,2,1)
|
|
#define VDINI256v32 __m256i sv = _mm256_set1_epi32(start); const __m256i zv = _mm256_setzero_si256(), cvi = _mm256_set_epi32(8,7,6,5,4,3,2,1)
|
|
|
|
#define VD128v16(_v_,_sv_) _sv_ = mm_scani_epi16(_v_,_sv_, cvi); _v_ = _sv_
|
|
#define VD128v32(_v_,_sv_) _sv_ = mm_scani_epi32(_v_,_sv_, cvi); _v_ = _sv_
|
|
#define VD256v32(_v_,_sv_) _sv_ = mm256_scani_epi32(_v_,_sv_, cvi); _v_ = _sv_
|
|
#include "v8.c"
|
|
|
|
#else //---------------------------------------------- Templates -------------------------------------------------------------
|
|
#define BN32(x) (x?(__bsr32(x)/8):0)
|
|
#define VLE1(_m_) { VE32(ip[0]); unsigned _b = BN32(v); ctou32(op) = v; op += _b+1; _m_ |= _b<<((ip-sp)*2); }
|
|
#define VLE4(_i_) { unsigned _b,_m;\
|
|
VE32(ip[_i_+0]); _b = BN32(v); ctou32(op) = v; op += _b+1; _m = _b; \
|
|
VE32(ip[_i_+1]); _b = BN32(v); ctou32(op) = v; op += _b+1; _m |= _b<<2;\
|
|
VE32(ip[_i_+2]); _b = BN32(v); ctou32(op) = v; op += _b+1; _m |= _b<<4;\
|
|
VE32(ip[_i_+3]); _b = BN32(v); ctou32(op) = v; op += _b+1; _m |= _b<<6;\
|
|
*out++ = _m;\
|
|
}
|
|
|
|
#define mm256_packus_epi16(a, b) _mm256_permute4x64_epi64(_mm256_packus_epi16(a, b), _MM_SHUFFLE(3, 1, 2, 0))
|
|
|
|
unsigned char *T2(V8ENC,32)(uint32_t *__restrict in, unsigned n, unsigned char *__restrict out V8DELTA32) {
|
|
uint32_t *ip,v;
|
|
unsigned char *op = DATABEG(out,n,32),*sp = out;
|
|
|
|
#ifdef __AVX2__ // slightly faster than SSE ----------------------------------------------------------------------------------------------
|
|
VEINI256v32; const __m256i cv1_8 = _mm256_set1_epi8(1), cv7f00 = _mm256_set1_epi16(0x7F00), zv = _mm256_setzero_si256();
|
|
for(ip = in; ip != in+(n&~(32-1)); ip += 32) { //PREFETCH(ip+384,0);
|
|
__m256i iv0 = _mm256_loadu_si256(ip ),
|
|
iv1 = _mm256_loadu_si256(ip+ 8); VE256v32(iv0,sv); VE256v32(iv1,sv);
|
|
__m256i iv2 = _mm256_loadu_si256(ip+16),
|
|
iv3 = _mm256_loadu_si256(ip+24); VE256v32(iv2,sv); VE256v32(iv3,sv);
|
|
__m256i mv0 = mm256_packus_epi16(_mm256_min_epu8(iv0,cv1_8), _mm256_min_epu8(iv1,cv1_8)); //mv0 = _mm256_permute4x64_epi64(mv0, _MM_SHUFFLE(3, 1, 2, 0));
|
|
mv0 = _mm256_min_epi16(mv0, cv1_8); mv0 = _mm256_adds_epu16(mv0, cv7f00);
|
|
uint32_t m0 = _mm256_movemask_epi8(mv0);
|
|
|
|
__m256i ov0 = _mm256_castsi128_si256( SVE32(m0 << 4));
|
|
ov0 = _mm256_inserti128_si256(ov0, SVE32(m0 >> 4),1);
|
|
__m256i ov1 = _mm256_castsi128_si256( SVE32(m0 >>12));
|
|
ov1 = _mm256_inserti128_si256(ov1, SVE32(m0 >>20),1);
|
|
|
|
__m256i mv1 = _mm256_packus_epi16(_mm256_min_epu8(iv2,cv1_8), _mm256_min_epu8(iv3,cv1_8)); mv1 = _mm256_permute4x64_epi64(mv1, _MM_SHUFFLE(3, 1, 2, 0));
|
|
mv1 = _mm256_min_epi16(mv1, cv1_8); mv1 = _mm256_adds_epu16(mv1, cv7f00);
|
|
uint32_t m1 = _mm256_movemask_epi8(mv1);
|
|
__m256i ov2 = _mm256_castsi128_si256( SVE32(m1 << 4));
|
|
ov2 = _mm256_inserti128_si256(ov2, SVE32(m1 >> 4),1);
|
|
__m256i ov3 = _mm256_castsi128_si256( SVE32(m1 >>12));
|
|
ov3 = _mm256_inserti128_si256(ov3, SVE32(m1 >>20),1);
|
|
|
|
ov0 = _mm256_shuffle_epi8(iv0,ov0);
|
|
ov1 = _mm256_shuffle_epi8(iv1,ov1);
|
|
|
|
ctou32(OP) = m0; ctou32(OP+4) = m1; OP += 8;
|
|
_mm_storeu_si128((__m128i *)op, _mm256_castsi256_si128( ov0)); op += LEN32(m0,0);
|
|
_mm_storeu_si128((__m128i *)op, _mm256_extracti128_si256(ov0,1)); op += LEN32(m0,1);
|
|
_mm_storeu_si128((__m128i *)op, _mm256_castsi256_si128( ov1)); op += LEN32(m0,2);
|
|
_mm_storeu_si128((__m128i *)op, _mm256_extracti128_si256(ov1,1)); op += LEN32(m0,3);
|
|
|
|
ov2 = _mm256_shuffle_epi8(iv2,ov2);
|
|
ov3 = _mm256_shuffle_epi8(iv3,ov3);
|
|
_mm_storeu_si128((__m128i *)op, _mm256_castsi256_si128( ov2)); op += LEN32(m1,0);
|
|
_mm_storeu_si128((__m128i *)op, _mm256_extracti128_si256(ov2,1)); op += LEN32(m1,1);
|
|
_mm_storeu_si128((__m128i *)op, _mm256_castsi256_si128( ov3)); op += LEN32(m1,2);
|
|
_mm_storeu_si128((__m128i *)op, _mm256_extracti128_si256(ov3,1)); op += LEN32(m1,3);
|
|
}
|
|
#elif defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) // https://gist.github.com/aqrit/746d2f5e4ad1909230e2283272333dc1
|
|
VEINI128v32; const __m128i cv1_8 = _mm_set1_epi8(1), cv7f00 = _mm_set1_epi16(0x7f00);
|
|
for(ip = in; ip != in+(n&~(32-1)); ip += 32, PNEXT(out,op,8) ) {
|
|
__m128i iv0 = _mm_loadu_si128(ip ),
|
|
iv1 = _mm_loadu_si128(ip+ 4);
|
|
__m128i iv2 = _mm_loadu_si128(ip+ 8),
|
|
iv3 = _mm_loadu_si128(ip+12); VE128v32(iv0,sv); VE128v32(iv1,sv); VE128v32(iv2,sv); VE128v32(iv3,sv); //delta,zigzag,...
|
|
__m128i mv0 = _mm_packus_epi16(_mm_min_epu8(iv0,cv1_8), _mm_min_epu8(iv1,cv1_8)); mv0 = _mm_min_epi16( mv0, cv1_8); mv0 = _mm_adds_epu16(mv0, cv7f00);
|
|
__m128i mv1 = _mm_packus_epi16(_mm_min_epu8(iv2,cv1_8), _mm_min_epu8(iv3,cv1_8)); mv1 = _mm_min_epi16( mv1, cv1_8); mv1 = _mm_adds_epu16(mv1, cv7f00);
|
|
uint16_t m0 = _mm_movemask_epi8(mv0);
|
|
uint16_t m1 = _mm_movemask_epi8(mv1);
|
|
__m128i iv4 = _mm_loadu_si128(ip+16),
|
|
iv5 = _mm_loadu_si128(ip+20);
|
|
__m128i iv6 = _mm_loadu_si128(ip+24),
|
|
iv7 = _mm_loadu_si128(ip+28); VE128v32(iv4,sv); VE128v32(iv5,sv); VE128v32(iv6,sv); VE128v32(iv7,sv);
|
|
__m128i mv2 = _mm_packus_epi16(_mm_min_epu8(iv4,cv1_8), _mm_min_epu8(iv5,cv1_8)); mv2 = _mm_min_epi16( mv2, cv1_8); mv2 = _mm_adds_epu16(mv2, cv7f00);
|
|
__m128i mv3 = _mm_packus_epi16(_mm_min_epu8(iv6,cv1_8), _mm_min_epu8(iv7,cv1_8)); mv3 = _mm_min_epi16( mv3, cv1_8); mv3 = _mm_adds_epu16(mv3, cv7f00);
|
|
uint16_t m2 = _mm_movemask_epi8(mv2);
|
|
uint16_t m3 = _mm_movemask_epi8(mv3);
|
|
|
|
{ __m128i ov0 = _mm_shuffle_epi8(iv0, SVE32(m0 << 4)),
|
|
ov1 = _mm_shuffle_epi8(iv1, SVE32(m0 >> 4));
|
|
__m128i ov2 = _mm_shuffle_epi8(iv2, SVE32(m1 << 4)),
|
|
ov3 = _mm_shuffle_epi8(iv3, SVE32(m1 >> 4));
|
|
|
|
ctou32(out) = (unsigned)m1<<16|m0;
|
|
_mm_storeu_si128((__m128i *)(op+IPINC), ov0); op += LEN32(m0,0)+IPINC;
|
|
_mm_storeu_si128((__m128i *)op, ov1); op += LEN32(m0,1);
|
|
_mm_storeu_si128((__m128i *)op, ov2); op += LEN32(m1,0);
|
|
_mm_storeu_si128((__m128i *)op, ov3); op += LEN32(m1,1);
|
|
}
|
|
ctou32(out+4) = (unsigned)m3<<16|m2;
|
|
{ __m128i ov0 = _mm_shuffle_epi8(iv4, SVE32(m2 << 4)),
|
|
ov1 = _mm_shuffle_epi8(iv5, SVE32(m2 >> 4));
|
|
__m128i ov2 = _mm_shuffle_epi8(iv6, SVE32(m3 << 4)),
|
|
ov3 = _mm_shuffle_epi8(iv7, SVE32(m3 >> 4));
|
|
_mm_storeu_si128((__m128i *)op, ov0); op += LEN32(m2,0);
|
|
_mm_storeu_si128((__m128i *)op, ov1); op += LEN32(m2,1);
|
|
_mm_storeu_si128((__m128i *)op, ov2); op += LEN32(m3,0);
|
|
_mm_storeu_si128((__m128i *)op, ov3); op += LEN32(m3,1);
|
|
} //PREFETCH(ip+384,0);
|
|
}
|
|
#else //------------------------------ scalar ----------------------------------------------
|
|
for(ip = in; ip != in+(n&~(32-1)); ip += 32) { PNEXTA(out,op,8); VLE4( 0); VLE4( 4); VLE4( 8); VLE4(12); VLE4(16); VLE4(20); VLE4(24); VLE4(28); /*PREFETCH(ip+512,0);*/ }
|
|
#endif
|
|
for( ; ip != in+(n&~(4-1)); ip += 4) { PNEXTA(out,op,1); VLE4(0); }
|
|
if(ip != in+n) { uint32_t *sp = ip; for(*OP=0,PNEXTA(out,op,1); ip != in+n; ip++ ) VLE1(out[0]); }
|
|
return op;
|
|
}
|
|
|
|
#define VLD1(_i_) { unsigned _b = ((m>>((op-sp)*2))& 3)+1; v = ctou32(ip) & ((1ull<<(_b*8))-1); *op = VD32(v); ip+=_b; }
|
|
#define VLD4(_i_) { unsigned _b,m = *in++;\
|
|
_b = (m & 3)+1; v = ctou32(ip) & ((1ull<<(_b*8))-1); op[_i_+0] = VD32(v); ip+=_b;\
|
|
_b = ((m>>2)& 3)+1; v = ctou32(ip) & ((1ull<<(_b*8))-1); op[_i_+1] = VD32(v); ip+=_b;\
|
|
_b = ((m>>4)& 3)+1; v = ctou32(ip) & ((1ull<<(_b*8))-1); op[_i_+2] = VD32(v); ip+=_b;\
|
|
_b = ((m>>6)& 3)+1; v = ctou32(ip) & ((1ull<<(_b*8))-1); op[_i_+3] = VD32(v); ip+=_b;\
|
|
}
|
|
|
|
unsigned char *T2(V8DEC,32)(unsigned char *__restrict in, unsigned n, uint32_t *__restrict out V8DELTA32) {
|
|
uint32_t *op=out, v;
|
|
unsigned char *ip = DATABEG(in,n,32);
|
|
if(!n) return in;
|
|
#ifdef __AVX2__ //slightly faster than SSE ------------------------------------------------------------------------------------------
|
|
VDINI256v32;
|
|
uint64_t mx = ctou64(IP);
|
|
for(; op != out+(n&~(32-1)); op += 32) { PREFETCH(ip+384,0);
|
|
uint64_t m = mx; mx = ctou64(IP+=8);
|
|
{__m256i ov0 = _mm256_castsi128_si256( _mm_loadu_si128(ip)); ip += LEN32(m,0);
|
|
ov0 = _mm256_inserti128_si256(ov0, _mm_loadu_si128(ip),1); ip += LEN32(m,1);
|
|
__m256i fv0 = _mm256_castsi128_si256( SVD32(m,0));
|
|
fv0 = _mm256_inserti128_si256(fv0, SVD32(m,1),1); m>>=16;
|
|
__m256i ov1 = _mm256_castsi128_si256( _mm_loadu_si128(ip)); ip += LEN32(m,0);
|
|
ov1 = _mm256_inserti128_si256(ov1, _mm_loadu_si128(ip),1); ip += LEN32(m,1);
|
|
__m256i fv1 = _mm256_castsi128_si256( SVD32(m,0));
|
|
fv1 = _mm256_inserti128_si256(fv1, SVD32(m,1),1); m>>=16;
|
|
ov0 = _mm256_shuffle_epi8(ov0, fv0);
|
|
ov1 = _mm256_shuffle_epi8(ov1, fv1); VD256v32(ov0,sv); VD256v32(ov1,sv);
|
|
_mm256_storeu_si256(op, ov0);
|
|
_mm256_storeu_si256(op+8, ov1);
|
|
}{
|
|
__m256i ov2 = _mm256_castsi128_si256( _mm_loadu_si128(ip)); ip += LEN32(m,0);
|
|
ov2 = _mm256_inserti128_si256(ov2, _mm_loadu_si128(ip),1); ip += LEN32(m,1);
|
|
__m256i fv2 = _mm256_castsi128_si256( SVD32(m,0));
|
|
fv2 = _mm256_inserti128_si256(fv2, SVD32(m,1),1); m>>=16;
|
|
__m256i ov3 = _mm256_castsi128_si256( _mm_loadu_si128(ip)); ip += LEN32(m,0);
|
|
ov3 = _mm256_inserti128_si256(ov3, _mm_loadu_si128(ip),1); ip += LEN32(m,1);
|
|
|
|
__m256i fv3 = _mm256_castsi128_si256( SVD32(m,0));
|
|
fv3 = _mm256_inserti128_si256(fv3, SVD32(m,1),1);
|
|
ov2 = _mm256_shuffle_epi8(ov2, fv2);
|
|
ov3 = _mm256_shuffle_epi8(ov3, fv3); VD256v32(ov2,sv); VD256v32(ov3,sv);
|
|
_mm256_storeu_si256(op+16, ov2);
|
|
_mm256_storeu_si256(op+24, ov3);
|
|
}
|
|
}
|
|
#elif defined(__SSSE3__) // optimzed for x86
|
|
VDINI128v32;
|
|
if(n >= 32) {
|
|
uint64_t mx = ctou64(IP);
|
|
for(; op != out+(n&~(32-1)); op += 32) {
|
|
uint64_t m = mx; mx = ctou64(IP+=8);
|
|
{__m128i ou0 = _mm_loadu_si128(ip); ip += LEN32((uint8_t )m,0);
|
|
__m128i ou1 = _mm_loadu_si128(ip); ip += LEN32((uint16_t)m,1);
|
|
__m128i su0 = SVD32((uint8_t )m,0);
|
|
__m128i su1 = SVD32((uint16_t)m,1); m>>=16;
|
|
|
|
__m128i ou2 = _mm_loadu_si128(ip); ip += LEN32((uint8_t )m,0);
|
|
__m128i ou3 = _mm_loadu_si128(ip); ip += LEN32((uint16_t)m,1);
|
|
__m128i su2 = SVD32((uint8_t )m,0);
|
|
__m128i su3 = SVD32((uint16_t)m,1); m>>=16;
|
|
|
|
ou0 = _mm_shuffle_epi8(ou0, su0);
|
|
ou1 = _mm_shuffle_epi8(ou1, su1);
|
|
ou2 = _mm_shuffle_epi8(ou2, su2);
|
|
ou3 = _mm_shuffle_epi8(ou3, su3);
|
|
|
|
VD128v32(ou0,sv); _mm_storeu_si128(op, ou0);
|
|
VD128v32(ou1,sv); _mm_storeu_si128(op+ 4, ou1);
|
|
VD128v32(ou2,sv); _mm_storeu_si128(op+ 8, ou2);
|
|
VD128v32(ou3,sv); _mm_storeu_si128(op+12, ou3);
|
|
}{
|
|
__m128i ov0 = _mm_loadu_si128(ip); ip += LEN32((uint8_t )m,0);
|
|
__m128i ov1 = _mm_loadu_si128(ip); ip += LEN32((uint16_t)m,1);
|
|
__m128i sv0 = SVD32((uint8_t )m,0);
|
|
__m128i sv1 = SVD32((uint16_t)m,1); m>>=16;
|
|
__m128i ov2 = _mm_loadu_si128(ip); ip += LEN32((uint8_t )m,0);
|
|
__m128i ov3 = _mm_loadu_si128(ip); ip += LEN32((uint16_t)m,1);
|
|
__m128i sv2 = SVD32(m,0);
|
|
__m128i sv3 = SVD32(m,1);
|
|
|
|
ov0 = _mm_shuffle_epi8(ov0, sv0);
|
|
ov1 = _mm_shuffle_epi8(ov1, sv1);
|
|
ov2 = _mm_shuffle_epi8(ov2, sv2);
|
|
ov3 = _mm_shuffle_epi8(ov3, sv3);
|
|
|
|
VD128v32(ov0,sv); _mm_storeu_si128(op+16, ov0);
|
|
VD128v32(ov1,sv); _mm_storeu_si128(op+20, ov1);
|
|
VD128v32(ov2,sv); _mm_storeu_si128(op+24, ov2);
|
|
VD128v32(ov3,sv); _mm_storeu_si128(op+28, ov3); PREFETCH(ip+384,0);
|
|
}
|
|
}
|
|
}
|
|
#elif defined(__ARM_NEON) || defined(__loongarch_lp64) || defined(__SSSE3__) // optimzed for ARM ----------------------------------------------------------
|
|
VDINI128v32;
|
|
for(; op != out+(n&~(32-1)); op += 32) { //PREFETCH(ip+384,0);
|
|
uint32_t m0 = ctou32(IP), m1 = ctou32(IP+4);
|
|
__m128i ov0 = _mm_loadu_si128(ip+IPINC); ip += LEN32(m0,0)+IPINC;
|
|
__m128i fv0 = SVD32(m0,0);
|
|
__m128i ov1 = _mm_loadu_si128(ip); ip += LEN32(m0,1);
|
|
__m128i fv1 = SVD32(m0,1);
|
|
__m128i ov2 = _mm_loadu_si128(ip); ip += LEN32(m0,2);
|
|
__m128i fv2 = SVD32(m0,2);
|
|
__m128i ov3 = _mm_loadu_si128(ip); ip += LEN32(m0,3);
|
|
__m128i fv3 = SVD32(m0,3);
|
|
ov0 = _mm_shuffle_epi8( ov0, fv0);
|
|
ov1 = _mm_shuffle_epi8( ov1, fv1);
|
|
ov2 = _mm_shuffle_epi8( ov2, fv2);
|
|
ov3 = _mm_shuffle_epi8( ov3, fv3);
|
|
|
|
__m128i fv4 = SVD32(m1,0);
|
|
__m128i ov4 = _mm_loadu_si128(ip); ip += LEN32(m1,0);
|
|
__m128i fv5 = SVD32(m1,1);
|
|
__m128i ov5 = _mm_loadu_si128(ip); ip += LEN32(m1,1);
|
|
__m128i fv6 = SVD32(m1,2);
|
|
__m128i ov6 = _mm_loadu_si128(ip); ip += LEN32(m1,2);
|
|
__m128i fv7 = SVD32(m1,3);
|
|
__m128i ov7 = _mm_loadu_si128(ip); ip += LEN32(m1,3);
|
|
|
|
ov4 = _mm_shuffle_epi8( ov4, fv4);
|
|
ov5 = _mm_shuffle_epi8( ov5, fv5);
|
|
ov6 = _mm_shuffle_epi8( ov6, fv6);
|
|
ov7 = _mm_shuffle_epi8( ov7, fv7);
|
|
VD128v32(ov0,sv); VD128v32(ov1,sv); VD128v32(ov2,sv); VD128v32(ov3,sv);
|
|
VD128v32(ov4,sv); VD128v32(ov5,sv); VD128v32(ov6,sv); VD128v32(ov7,sv); //delta,zigzag,....
|
|
_mm_storeu_si128(op, ov0);
|
|
_mm_storeu_si128(op+4, ov1);
|
|
_mm_storeu_si128(op+8, ov2);
|
|
_mm_storeu_si128(op+12, ov3);
|
|
_mm_storeu_si128(op+16, ov4);
|
|
_mm_storeu_si128(op+20, ov5);
|
|
_mm_storeu_si128(op+24, ov6);
|
|
_mm_storeu_si128(op+28, ov7);
|
|
PNEXTB(in,8);
|
|
}
|
|
#else //----------------------------- scalar -----------------------------------------------
|
|
for(; op != out+(n&~(32-1)); op += 32) { in = ip; ip+=8;
|
|
VLD4( 0); VLD4( 4); VLD4( 8); VLD4(12); VLD4(16); VLD4(20); VLD4(24); VLD4(28); //PREFETCH(ip+512,0);
|
|
}
|
|
#endif
|
|
uint32_t m; for(; op != out+(n&~(4-1)); op += 4) { PNEXTA(in,ip,1); VLD4( 0); }
|
|
if(op != out+n) { uint32_t *sp = op; for(m = *IP++; op != out+n; op++ ) VLD1( 0);}
|
|
return ip;
|
|
}
|
|
|
|
//------------------------------------ 16 bits ---------------------------------------------------------------------
|
|
/* Byte length of one encoded group of 8 16-bit values: 8 mandatory low bytes
 * plus one extra byte per set bit in the _i_-th byte of the length mask _m_.
 * Hygiene fix: _m_ and _i_ are now parenthesized in the expansion (CERT
 * PRE01-C). All visible call sites pass simple variables/constants, so the
 * generated code is unchanged. */
#define LEN16(_m_,_i_) (8+popcnt32((uint8_t)((_m_)>>((_i_)<<3))))
|
|
|
|
/* Extra-byte flag for a 16-bit value: 1 when _x_ does not fit in a single
 * byte (high byte nonzero), 0 otherwise. A relational expression already
 * evaluates to int 0/1 in C, so no ternary is needed. */
#define BN16(_x_) ((_x_) > 0xff)
|
|
/* Encode a single 16-bit value ip[0]: VE16 (project delta/zigzag hook) leaves
 * the transformed value in v; it is stored as 1 or 2 bytes at op, and the
 * extra-byte flag is recorded in bit (ip-sp) of the caller's mask _m_.
 * Note: ctou16(op) always writes 2 bytes while op advances by only _b+1;
 * the possible spill byte is overwritten by the next store or covered by the
 * v8bound padding (NOTE(review): assumed — confirm against buffer sizing).
 * Deliberately unhygienic: relies on call-site locals ip, sp, op and v,
 * matching this file's macro style. */
#define VLE1(_m_) { VE16(ip[0]); unsigned _b = BN16(v); ctou16(op) = v; op += _b+1; _m_ |= _b<<(ip-sp); }
|
|
|
|
/* Encode 8 consecutive 16-bit values ip[_i_]..ip[_i_+7]. Each value is run
 * through VE16 (project delta/zigzag variant hook, result in v), stored as
 * 1 or 2 bytes at op, and its extra-byte flag is accumulated into the mask
 * byte _m (bit k for value k), finally written through *out. PNEXTA manages
 * the mask/payload cursor bookkeeping (project macro — exact interleaving
 * assumed from usage; verify against its definition).
 * Note: each ctou16(op) store writes 2 bytes while op advances by only
 * _b+1; the possible spill byte is overwritten by the next store (output
 * must be sized with v8bound). Unhygienic by design: uses call-site
 * locals ip, op, out and v. */
#define VLE8(_i_) { unsigned _b,_m; PNEXTA(out,op,1);\
  VE16(ip[_i_+0]); _b = BN16(v); ctou16(op) = v; op += _b+1; _m  = _b;   \
  VE16(ip[_i_+1]); _b = BN16(v); ctou16(op) = v; op += _b+1; _m |= _b<<1;\
  VE16(ip[_i_+2]); _b = BN16(v); ctou16(op) = v; op += _b+1; _m |= _b<<2;\
  VE16(ip[_i_+3]); _b = BN16(v); ctou16(op) = v; op += _b+1; _m |= _b<<3;\
  VE16(ip[_i_+4]); _b = BN16(v); ctou16(op) = v; op += _b+1; _m |= _b<<4;\
  VE16(ip[_i_+5]); _b = BN16(v); ctou16(op) = v; op += _b+1; _m |= _b<<5;\
  VE16(ip[_i_+6]); _b = BN16(v); ctou16(op) = v; op += _b+1; _m |= _b<<6;\
  VE16(ip[_i_+7]); _b = BN16(v); ctou16(op) = v; op += _b+1; _m |= _b<<7;\
  *out++ = _m;\
}
|
|
|
|
/* TurboByte 16-bit encode (SIMD group varint / Streamvbyte family).
 * Packs n 16-bit values from in[] into out[]: every value is stored as 1 or
 * 2 bytes, preceded by length-mask bytes (one bit per value, set = 2 bytes).
 * Returns a pointer one past the last payload byte written.
 * NOTE(review): T2/V8ENC/V8DELTA16/VE16/VE128v16/VEINI128v16/DATABEG/
 * PNEXT/PNEXTA/SVE16/IPINC are project macros selecting the plain/delta/
 * zigzag variant and the mask/payload layout — their semantics here are
 * inferred from usage; confirm against their definitions in the headers. */
unsigned char *T2(V8ENC,16)(uint16_t *__restrict in, unsigned n, unsigned char *__restrict out V8DELTA16) {
  uint16_t *ip,v;
  unsigned char *op = DATABEG(out,n,16);   /* op = payload cursor; out = mask-byte cursor */

#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) //--------------------------------
  /* SIMD path: 64 values (8 vectors of 8 x u16) per iteration; PNEXT
   * advances the mask cursor past the 8 mask bytes each round. */
  VEINI128v16; const __m128i cv1_8 = _mm_set1_epi8(1);   /* per-byte clamp constant for mask build */
  for(ip = in; ip != in+(n&~(64-1)); ip += 64, PNEXT(out,op,8)) { //PREFETCH(ip+512,0);
    __m128i iv0 = _mm_loadu_si128(ip   ),
            iv1 = _mm_loadu_si128(ip+ 8);
    __m128i iv2 = _mm_loadu_si128(ip+16),
            iv3 = _mm_loadu_si128(ip+24); VE128v16(iv0,sv); VE128v16(iv1,sv); VE128v16(iv2,sv); VE128v16(iv3,sv);
    /* Length-mask trick: min_epu8(x,1) clamps every byte to 0/1, making each
     * 16-bit lane 0, 1, 0x100 or 0x101; packus_epi16 saturates lanes with a
     * nonzero high byte to 0xff, so movemask_epi8 yields one bit per value
     * that is set exactly when that value needs a second byte. */
    __m128i mv0 = _mm_packus_epi16(_mm_min_epu8(iv0,cv1_8), _mm_min_epu8(iv1,cv1_8));
    __m128i mv1 = _mm_packus_epi16(_mm_min_epu8(iv2,cv1_8), _mm_min_epu8(iv3,cv1_8));
    uint16_t m0 = _mm_movemask_epi8(mv0), m1 = _mm_movemask_epi8(mv1);
    ctou16(out) = m0; ctou16(out+2) = m1;   /* first 4 of this group's 8 mask bytes */
    __m128i iv4 = _mm_loadu_si128(ip+32),
            iv5 = _mm_loadu_si128(ip+40);
    __m128i iv6 = _mm_loadu_si128(ip+48),
            iv7 = _mm_loadu_si128(ip+56); VE128v16(iv4,sv); VE128v16(iv5,sv);VE128v16(iv6,sv); VE128v16(iv7,sv);
    __m128i mv4 = _mm_packus_epi16(_mm_min_epu8(iv4,cv1_8), _mm_min_epu8(iv5,cv1_8));
    __m128i mv5 = _mm_packus_epi16(_mm_min_epu8(iv6,cv1_8), _mm_min_epu8(iv7,cv1_8));
    uint16_t m2 = _mm_movemask_epi8(mv4), m3 = _mm_movemask_epi8(mv5);

    /* SVE16 looks up a pshufb pattern that drops the unused high bytes; the
     * <<4 / >>4 shifts align the relevant mask bits for the table index
     * (presumably — verify against the SVE16 table definition). */
    __m128i ov0 = _mm_shuffle_epi8(iv0, SVE16(m0 << 4)),
            ov1 = _mm_shuffle_epi8(iv1, SVE16(m0 >> 4)),
            ov2 = _mm_shuffle_epi8(iv2, SVE16(m1 << 4)),
            ov3 = _mm_shuffle_epi8(iv3, SVE16(m1 >> 4));
    /* Full 16-byte stores with variable advance (LEN16 = 8 + popcount of the
     * mask byte): trailing bytes are overwritten by the next store; output
     * must be sized with v8bound (NOTE(review): assumed — confirm). */
    _mm_storeu_si128((__m128i *)(op+IPINC), ov0); op += LEN16(m0,0)+IPINC;
    _mm_storeu_si128((__m128i *)op,         ov1); op += LEN16(m0,1);
    _mm_storeu_si128((__m128i *)op,         ov2); op += LEN16(m1,0);
    _mm_storeu_si128((__m128i *)op,         ov3); op += LEN16(m1,1);

    __m128i ov4 = _mm_shuffle_epi8(iv4, SVE16(m2 << 4)),
            ov5 = _mm_shuffle_epi8(iv5, SVE16(m2 >> 4)),
            ov6 = _mm_shuffle_epi8(iv6, SVE16(m3 << 4)),
            ov7 = _mm_shuffle_epi8(iv7, SVE16(m3 >> 4));
    _mm_storeu_si128((__m128i *)op, ov4); op += LEN16(m2,0);
    _mm_storeu_si128((__m128i *)op, ov5); op += LEN16(m2,1);
    _mm_storeu_si128((__m128i *)op, ov6); op += LEN16(m3,0);
    _mm_storeu_si128((__m128i *)op, ov7); op += LEN16(m3,1);
    ctou16(out+4) = m2; ctou16(out+6) = m3;   /* remaining 4 mask bytes */
  }
#else //---------------------- scalar ---------------------------------------
  /* Scalar path: 64 values per iteration via 8x VLE8; 8 mask bytes are
   * reserved before the payload of each group, then the mask cursor catches
   * up (out = op) — exact interleaving depends on PNEXTA (project macro). */
  for(ip = in; ip != in+(n&~(64-1)); ip += 64) { //PREFETCH(ip+512,0);
    op += 8;
    VLE8( 0); VLE8( 8); VLE8(16); VLE8(24); VLE8(32); VLE8(40); VLE8(48); VLE8(56);
    out = op;
  }
#endif
  /* Tail: remaining full groups of 8, then the last <8 values one at a
   * time, their flags packed into a single zero-initialized mask byte. */
  for( ; ip != in+(n&~(8-1)); ip += 8) VLE8(0);
  if(ip != in+n) { uint16_t *sp = ip; for(PNEXTA(out,op,1),*out=0; ip != in+n; ip++ ) VLE1(out[0]); }
  return op;
}
|
|
|
|
/* Decode a single 16-bit value: its length flag is bit (op-sp) of the mask m;
 * read 1 or 2 bytes from ip, mask off to the flagged width, run through VD16
 * (project delta/zigzag undo hook) and store to *op. ctou16(ip) may read one
 * byte past the last encoded value — the input must carry the v8bound padding
 * (NOTE(review): assumed; confirm against the encoder's bounds). Unhygienic
 * by design: relies on call-site locals m, op, sp, ip and v. */
#define VLD1(_i_) { unsigned _b = ((m>>(op-sp))& 1)+1; v = ctou16(ip) & ((1<<(_b*8))-1); *op = VD16(v); ip+=_b; }
|
|
|
|
/* Decode 8 values using one mask byte fetched via *IP++ (project mask-cursor
 * macro): bit k selects a 1- or 2-byte field for op[_i_+k]; the field is
 * masked to its width, passed through VD16 (delta/zigzag undo hook) and
 * stored. ctou16(ip) may read one byte past the final field — the input must
 * carry the v8bound padding (NOTE(review): assumed; confirm). The local m
 * intentionally shadows any caller-scope m. */
#define VLD8(_i_) { unsigned _b,m = *IP++;\
  _b = (m     & 1)+1; v = ctou16(ip) & ((1<<(_b*8))-1); op[_i_+0] = VD16(v); ip+=_b;\
  _b = ((m>>1)& 1)+1; v = ctou16(ip) & ((1<<(_b*8))-1); op[_i_+1] = VD16(v); ip+=_b;\
  _b = ((m>>2)& 1)+1; v = ctou16(ip) & ((1<<(_b*8))-1); op[_i_+2] = VD16(v); ip+=_b;\
  _b = ((m>>3)& 1)+1; v = ctou16(ip) & ((1<<(_b*8))-1); op[_i_+3] = VD16(v); ip+=_b;\
  _b = ((m>>4)& 1)+1; v = ctou16(ip) & ((1<<(_b*8))-1); op[_i_+4] = VD16(v); ip+=_b;\
  _b = ((m>>5)& 1)+1; v = ctou16(ip) & ((1<<(_b*8))-1); op[_i_+5] = VD16(v); ip+=_b;\
  _b = ((m>>6)& 1)+1; v = ctou16(ip) & ((1<<(_b*8))-1); op[_i_+6] = VD16(v); ip+=_b;\
  _b = ((m>>7)& 1)+1; v = ctou16(ip) & ((1<<(_b*8))-1); op[_i_+7] = VD16(v); ip+=_b;\
}
|
|
|
|
/* TurboByte 16-bit decode: unpacks n 16-bit values from the stream in[]
 * (length-mask bytes + 1/2-byte fields, as produced by the matching encoder)
 * into out[]. Returns a pointer one past the last input byte consumed.
 * NOTE(review): T2/V8DEC/V8DELTA16/VD16/VD128v16/VDINI128v16/DATABEG/SVD16/
 * IP/IPINC/PNEXTB are project macros selecting the plain/delta/zigzag
 * variant and the mask/payload layout — semantics inferred from usage here;
 * confirm against their definitions in the headers. */
unsigned char *T2(V8DEC,16)(unsigned char *__restrict in, unsigned n, uint16_t *__restrict out V8DELTA16) {
  uint16_t *op;
  unsigned char *ip = DATABEG(in,n,16);   /* ip = payload cursor; IP = mask cursor */
  uint16_t v;

#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)//-----------------------
  /* SIMD path: 64 values per iteration. SVD16 looks up the pshufb pattern
   * that re-expands the packed 1/2-byte fields to 16-bit lanes; LEN16 gives
   * the consumed byte count per 8-value group. The unaligned 16-byte loads
   * can read past the last field — input must carry the v8bound padding
   * (NOTE(review): assumed; confirm against the encoder's bounds). */
  VDINI128v16;
  for(op = out; op != out+(n&~(64-1)); op += 64) { PREFETCH(ip+512,0);
    uint32_t m0 = ctou32(IP), m1 = ctou32(IP+4);   /* this group's 8 mask bytes */
    __m128i ov0 = _mm_shuffle_epi8(_mm_loadu_si128(ip+IPINC), SVD16(m0,0)); ip += LEN16(m0,0)+IPINC;
    __m128i ov1 = _mm_shuffle_epi8(_mm_loadu_si128(ip),       SVD16(m0,1)); ip += LEN16(m0,1);
    __m128i ov2 = _mm_shuffle_epi8(_mm_loadu_si128(ip),       SVD16(m0,2)); ip += LEN16(m0,2);
    __m128i ov3 = _mm_shuffle_epi8(_mm_loadu_si128(ip),       SVD16(m0,3)); ip += LEN16(m0,3);
    VD128v16(ov0,sv); VD128v16(ov1,sv); VD128v16(ov2,sv); VD128v16(ov3,sv);   /* delta/zigzag undo */
    _mm_storeu_si128(op,    ov0);
    _mm_storeu_si128(op+8,  ov1);
    _mm_storeu_si128(op+16, ov2);
    _mm_storeu_si128(op+24, ov3);

    __m128i ov4 = _mm_shuffle_epi8(_mm_loadu_si128(ip), SVD16(m1,0)); ip += LEN16(m1,0);
    __m128i ov5 = _mm_shuffle_epi8(_mm_loadu_si128(ip), SVD16(m1,1)); ip += LEN16(m1,1);
    __m128i ov6 = _mm_shuffle_epi8(_mm_loadu_si128(ip), SVD16(m1,2)); ip += LEN16(m1,2);
    __m128i ov7 = _mm_shuffle_epi8(_mm_loadu_si128(ip), SVD16(m1,3)); ip += LEN16(m1,3);
    VD128v16(ov4,sv); VD128v16(ov5,sv); VD128v16(ov6,sv); VD128v16(ov7,sv);
    _mm_storeu_si128(op+32, ov4);
    _mm_storeu_si128(op+40, ov5);
    _mm_storeu_si128(op+48, ov6);
    _mm_storeu_si128(op+56, ov7);
    PNEXTB(in,8);   /* advance mask cursor past the 8 consumed mask bytes */
  }
#else //-------------- scalar --------------------------------------------------------
  /* Scalar path: skip the group's 8 mask bytes (consumed via *IP++ inside
   * VLD8), decode 64 values, then resync the mask base (in = ip) — the exact
   * interleaving mirrors the scalar encoder (project IP macro). */
  for(op = out; op != out+(n&~(64-1)); op += 64) { ip += 8;
    VLD8( 0); VLD8( 8); VLD8(16); VLD8(24); VLD8(32); VLD8(40); VLD8(48); VLD8(56); PREFETCH(ip+512,0);
    in = ip;
  }
#endif
  /* Tail: remaining full groups of 8, then the last <8 values one at a time
   * driven by a single mask byte (m is shadowed inside VLD8; the outer m is
   * only used by VLD1). */
  uint32_t m; for(; op != out+(n&~(8-1)); op += 8) VLD8( 0);
  if(op != out+n) { uint16_t *sp = op; for(m = *IP++; op != out+n; op++ ) VLD1( 0);}
  return ip;
}
|
|
#endif
|