57 lines
1.6 KiB
C
57 lines
1.6 KiB
C
#include "../include/simdcomputil.h"
|
|
|
|
// Lane-wise difference between each 32-bit element and its predecessor.
// The predecessor of lane 0 of curr is the highest lane of prev; the
// predecessor of lanes 1..3 is the lane just below in curr itself.
// Result: { curr[0]-prev[3], curr[1]-curr[0], curr[2]-curr[1], curr[3]-curr[2] }.
__attribute__((always_inline))
static inline __m128i Delta(__m128i curr, __m128i prev) {
    // Move prev's highest lane down to lane 0 (upper lanes become zero).
    const __m128i carried_in = _mm_srli_si128(prev, 12);
    // Move curr up by one lane (lane 0 becomes zero).
    const __m128i shifted_up = _mm_slli_si128(curr, 4);
    const __m128i predecessors = _mm_or_si128(shifted_up, carried_in);
    return _mm_sub_epi32(curr, predecessors);
}
|
|
|
|
|
|
// Returns the bit width of v: 0 when v is 0, otherwise one more than the
// index of the highest set bit (so 1 -> 1, 255 -> 8, 256 -> 9).
uint32_t bits(const uint32_t v) {
    // Neither bit-scan primitive below is meaningful for a zero input.
    if (v == 0) {
        return 0;
    }
#ifdef _MSC_VER
    unsigned long highest_set;
    _BitScanReverse(&highest_set, v);
    return highest_set + 1;
#else
    // Any compiler that is not Microsoft's is assumed to be GCC-like.
    return 32 - __builtin_clz(v);
#endif
}
|
|
|
|
__attribute__ ((pure))
|
|
uint32_t maxbits(const uint32_t * begin) {
|
|
uint32_t accumulator = 0;const uint32_t * k;
|
|
for (k = begin; k != begin + SIMDBlockSize; ++k) {
|
|
accumulator |= *k;
|
|
}
|
|
return bits(accumulator);
|
|
}
|
|
|
|
// Returns bits() of the bitwise OR of the four 32-bit lanes of accumulator.
static uint32_t maxbitas32int(const __m128i accumulator) {
    uint32_t lanes[4];
    uint32_t combined = 0;
    int lane;
    // Spill the vector to memory so the lanes can be read as scalars.
    _mm_storeu_si128((__m128i *) lanes, accumulator);
    for (lane = 0; lane < 4; ++lane) {
        combined |= lanes[lane];
    }
    return bits(combined);
}
|
|
|
|
|
|
// maxbit over 128 integers (SIMDBlockSize) with provided initial value
|
|
uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in) {
|
|
__m128i initoffset = _mm_set1_epi32 (initvalue);
|
|
const __m128i* pin = (const __m128i*)(in);
|
|
__m128i newvec = _mm_loadu_si128(pin);
|
|
__m128i accumulator = Delta(newvec , initoffset);
|
|
__m128i oldvec = newvec;
|
|
uint32_t k;
|
|
for(k = 1; 4*k < SIMDBlockSize; ++k) {
|
|
newvec = _mm_loadu_si128(pin+k);
|
|
accumulator = _mm_or_si128(accumulator,Delta(newvec , oldvec));
|
|
oldvec = newvec;
|
|
}
|
|
initoffset = oldvec;
|
|
return maxbitas32int(accumulator);
|
|
}
|
|
|