408 lines
12 KiB
C++
408 lines
12 KiB
C++
/**
|
|
* This code is released under the
|
|
* Apache License Version 2.0 http://www.apache.org/licenses/.
|
|
*
|
|
* (c) Daniel Lemire, http://lemire.me/en/
|
|
* and Owen Kaser
|
|
*/
|
|
|
|
#ifndef UTIL
|
|
#define UTIL
|
|
#include "common.h"
|
|
|
|
#ifdef __linux__
|
|
#define USE_O_DIRECT
|
|
#endif
|
|
|
|
namespace FastPForLib {
|
|
|
|
//#define STATS
|
|
// taken from stackoverflow
|
|
#ifndef NDEBUG
|
|
#define ASSERT(condition, message) \
|
|
do { \
|
|
if (!(condition)) { \
|
|
std::cerr << "Assertion `" #condition "` failed in " << __FILE__ \
|
|
<< " line " << __LINE__ << ": " << message << std::endl; \
|
|
std::exit(EXIT_FAILURE); \
|
|
} \
|
|
} while (false)
|
|
#else
|
|
#define ASSERT(condition, message) \
|
|
do { \
|
|
} while (false)
|
|
#endif
|
|
|
|
/**
|
|
* Computes the greatest common divisor
|
|
*/
|
|
constexpr __attribute__((const)) uint32_t gcd(uint32_t x, uint32_t y) {
|
|
return (x % y) == 0 ? y : gcd(y, x % y);
|
|
}
|
|
|
|
template <class T> __attribute__((const)) T *padTo32bits(T *inbyte) {
|
|
return reinterpret_cast<T *>((reinterpret_cast<uintptr_t>(inbyte) + 3) & ~3);
|
|
}
|
|
|
|
template <class T>
|
|
__attribute__((const)) const T *padTo32bits(const T *inbyte) {
|
|
return reinterpret_cast<const T *>((reinterpret_cast<uintptr_t>(inbyte) + 3) &
|
|
~3);
|
|
}
|
|
|
|
template <class T> __attribute__((const)) T *padTo64bits(T *inbyte) {
|
|
return reinterpret_cast<T *>((reinterpret_cast<uintptr_t>(inbyte) + 7) & ~7);
|
|
}
|
|
|
|
template <class T>
|
|
__attribute__((const)) const T *padTo64bits(const T *inbyte) {
|
|
return reinterpret_cast<const T *>((reinterpret_cast<uintptr_t>(inbyte) + 7) &
|
|
~7);
|
|
}
|
|
|
|
template <class T> __attribute__((const)) T *padTo128bits(T *inbyte) {
|
|
return reinterpret_cast<T *>((reinterpret_cast<uintptr_t>(inbyte) + 15) &
|
|
~15);
|
|
}
|
|
|
|
template <class T>
|
|
__attribute__((const)) const T *padTo128bits(const T *inbyte) {
|
|
return reinterpret_cast<const T *>(
|
|
(reinterpret_cast<uintptr_t>(inbyte) + 15) & ~15);
|
|
}
|
|
|
|
template <class T> __attribute__((const)) T *padTo64bytes(T *inbyte) {
|
|
return reinterpret_cast<T *>((reinterpret_cast<uintptr_t>(inbyte) + 63) &
|
|
~63);
|
|
}
|
|
|
|
template <class T>
|
|
__attribute__((const)) const T *padTo64bytes(const T *inbyte) {
|
|
return reinterpret_cast<T *>((reinterpret_cast<uintptr_t>(inbyte) + 63) &
|
|
~63);
|
|
}
|
|
|
|
template <class T>
|
|
__attribute__((const)) bool needPaddingTo32Bits(const T *inbyte) {
|
|
return (reinterpret_cast<uintptr_t>(inbyte) & 3) != 0;
|
|
}
|
|
|
|
template <class T>
|
|
__attribute__((const)) bool needPaddingTo64Bits(const T *inbyte) {
|
|
return (reinterpret_cast<uintptr_t>(inbyte) & 7) != 0;
|
|
}
|
|
|
|
template <class T>
|
|
__attribute__((const)) bool needPaddingTo128Bits(const T *inbyte) {
|
|
return (reinterpret_cast<uintptr_t>(inbyte) & 15) != 0;
|
|
}
|
|
|
|
template <class T> bool needPaddingTo64bytes(const T *inbyte) {
|
|
return (reinterpret_cast<uintptr_t>(inbyte) & 63) != 0;
|
|
}
|
|
|
|
__attribute__((const)) inline uint32_t gccbits(const uint32_t v) {
|
|
#ifdef _MSC_VER
|
|
if (v == 0) {
|
|
return 0;
|
|
}
|
|
unsigned long answer;
|
|
_BitScanReverse(&answer, v);
|
|
return answer + 1;
|
|
#else
|
|
return v == 0 ? 0 : 32 - __builtin_clz(v);
|
|
#endif
|
|
}
|
|
|
|
#ifdef _MSC_VER
|
|
// taken from
|
|
// http://stackoverflow.com/questions/355967/how-to-use-msvc-intrinsics-to-get-the-equivalent-of-this-gcc-code
|
|
uint32_t __builtin_clz(uint32_t x) {
|
|
unsigned long r = 0;
|
|
_BitScanReverse(&r, x);
|
|
return (31 - r);
|
|
}
|
|
|
|
#endif
|
|
|
|
__attribute__((const)) inline bool divisibleby(size_t a, uint32_t x) {
|
|
return (a % x == 0);
|
|
}
|
|
|
|
/**
|
|
* compute the deltas, you do not want to use this
|
|
* function if speed matters. This is only for convenience.
|
|
*/
|
|
template <class container>
|
|
container diffs(const container &in, const bool aredistinct) {
|
|
container out;
|
|
if (in.empty())
|
|
return out;
|
|
out.resize(in.size() - 1);
|
|
for (size_t k = 0; k < in.size() - 1; ++k)
|
|
if (aredistinct)
|
|
out.push_back(in[k + 1] - in[k] - 1);
|
|
else
|
|
out.push_back(in[k + 1] - in[k]);
|
|
return out;
|
|
}
|
|
|
|
inline void checkifdivisibleby(size_t a, uint32_t x) {
|
|
if (!divisibleby(a, x)) {
|
|
std::ostringstream convert;
|
|
convert << a << " not divisible by " << x;
|
|
throw std::logic_error(convert.str());
|
|
}
|
|
}
|
|
|
|
template <class iter> void printme(iter i, iter b) {
|
|
for (iter j = i; j != b; ++j)
|
|
std::cout << *j << " ";
|
|
std::cout << std::endl;
|
|
}
|
|
|
|
__attribute__((const)) inline uint32_t asmbits(const uint32_t v) {
|
|
#ifdef _MSC_VER
|
|
return gccbits(v);
|
|
#else
|
|
if (v == 0)
|
|
return 0;
|
|
uint32_t answer;
|
|
__asm__("bsr %1, %0;" : "=r"(answer) : "r"(v));
|
|
return answer + 1;
|
|
#endif
|
|
}
|
|
|
|
__attribute__((const)) inline uint32_t slowbits(uint32_t v) {
|
|
uint32_t r = 0;
|
|
while (v) {
|
|
r++;
|
|
v = v >> 1;
|
|
}
|
|
return r;
|
|
}
|
|
|
|
__attribute__((const)) inline uint32_t bits(uint32_t v) {
|
|
uint32_t r(0);
|
|
if (v >= (1U << 15)) {
|
|
v >>= 16;
|
|
r += 16;
|
|
}
|
|
if (v >= (1U << 7)) {
|
|
v >>= 8;
|
|
r += 8;
|
|
}
|
|
if (v >= (1U << 3)) {
|
|
v >>= 4;
|
|
r += 4;
|
|
}
|
|
if (v >= (1U << 1)) {
|
|
v >>= 2;
|
|
r += 2;
|
|
}
|
|
if (v >= (1U << 0)) {
|
|
v >>= 1;
|
|
r += 1;
|
|
}
|
|
return r;
|
|
}
|
|
|
|
#ifndef _MSC_VER
|
|
__attribute__((const)) constexpr uint32_t constexprbits(uint32_t v) {
|
|
return v >= (1U << 15)
|
|
? 16 + constexprbits(v >> 16)
|
|
: (v >= (1U << 7))
|
|
? 8 + constexprbits(v >> 8)
|
|
: (v >= (1U << 3))
|
|
? 4 + constexprbits(v >> 4)
|
|
: (v >= (1U << 1))
|
|
? 2 + constexprbits(v >> 2)
|
|
: (v >= (1U << 0)) ? 1 + constexprbits(v >> 1)
|
|
: 0;
|
|
}
|
|
#else
|
|
|
|
template <int N> struct exprbits {
|
|
enum { value = 1 + exprbits<(N >> 1)>::value };
|
|
};
|
|
|
|
template <> struct exprbits<0> {
|
|
enum { value = 0 };
|
|
};
|
|
|
|
#define constexprbits(n) exprbits<n>::value
|
|
|
|
#endif
|
|
|
|
constexpr uint32_t div_roundup(uint32_t v, uint32_t divisor) {
|
|
return (v + (divisor - 1)) / divisor;
|
|
}
|
|
|
|
template <class iterator>
|
|
__attribute__((pure)) uint32_t maxbits(const iterator &begin,
|
|
const iterator &end) {
|
|
uint32_t accumulator = 0;
|
|
for (iterator k = begin; k != end; ++k) {
|
|
accumulator |= *k;
|
|
}
|
|
return gccbits(accumulator);
|
|
}
|
|
|
|
template <class iterator>
|
|
uint32_t slowmaxbits(const iterator &begin, const iterator &end) {
|
|
uint32_t accumulator = 0;
|
|
for (iterator k = begin; k != end; ++k) {
|
|
const uint32_t tb = gccbits(*k);
|
|
if (tb > accumulator)
|
|
accumulator = tb;
|
|
}
|
|
return accumulator;
|
|
}
|
|
|
|
// basically, we can sometimes memoize the maxbits computation
|
|
// Since the first scan looks at b input words, the second looks
|
|
// at b/2, the third looks at b/3... (total related to harmonic numbers)
|
|
// it is probably only worthwhile to memoize the first maybe 20% prefix
|
|
// (rest can be "naively" re-scanned if needed)
|
|
// also, a useful heuristic should be to start with however many
|
|
// bits are required for the first number in the sequence. Or OR
|
|
// the first two or three values together (danger, what if you OR
|
|
// more than you'd actually use?)
|
|
// alternative heuristic is to start with however many bits you used for the
|
|
// last encoding. See if it works. Yes: start sequential scan downward. No:
|
|
// start sequential scan upward.
|
|
// To be tried...
|
|
|
|
// template<class t>
|
|
// struct bitwise_or : public binary_function<t, t, t> {
|
|
// t operator()(t x, t y) { return x|y; }
|
|
//};
|
|
|
|
//
|
|
template <int b, class t, class iterator>
|
|
int greedy_bit_size_lookahead(const iterator &begin, const iterator &end) {
|
|
// assert(end- begin <= b);
|
|
std::vector<t> prefixOrBuffer(end -
|
|
begin); // consider a preallocated buffer...
|
|
|
|
partial_sum(begin, end, prefixOrBuffer.begin(),
|
|
[](t x, t y) { return x | y; } // change dl's + to |
|
|
// bitwise_or<t>()
|
|
);
|
|
// do the bitwise or-ing once only.
|
|
if (end - begin ==
|
|
b) { // expected case, to help out compiler. Should be unrolled
|
|
for (int i = 1; i < 31; ++i)
|
|
if (prefixOrBuffer[b / i - 1] < (static_cast<t>(1) << i))
|
|
return i;
|
|
// assert(false); // cannot get here unless 32+ bits required
|
|
return -1;
|
|
} else { // general case, maybe less data than we could pack with 1-bit fields
|
|
for (int i = 1; i < 31; ++i) {
|
|
uint64_t indexToCheck = b / i - 1;
|
|
if (indexToCheck >= prefixOrBuffer.size())
|
|
indexToCheck = prefixOrBuffer.size() - 1;
|
|
|
|
if (prefixOrBuffer[indexToCheck] < (static_cast<t>(1) << i))
|
|
return i;
|
|
}
|
|
// assert(false);
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
// assume the previous bit size is close to the required bit size
|
|
template <int b, class t, class iterator>
|
|
int greedy_bit_size_lookahead(const iterator &begin, const iterator &end,
|
|
uint32_t previous_size) {
|
|
|
|
uint32_t span_length = end - begin;
|
|
if (span_length == b) { // work on the specialization later...
|
|
// try previous size
|
|
if (maxbits(begin, begin + (b / previous_size)) > previous_size) {
|
|
// previous_size is too small; go until you find something bigger that
|
|
// works
|
|
for (uint32_t i = previous_size + 1;
|
|
i < previous_size + 32 /* was nothing */;
|
|
++i) // upper bound is only to encourage compiler to unroll
|
|
if (maxbits(begin, begin + (b / i)) <= i)
|
|
return i;
|
|
return -1; // impossible
|
|
} else { // previous_size works, but perhaps we can find something smaller
|
|
// that also works
|
|
uint32_t i;
|
|
for (i = previous_size - 1; i /* > 0 */ != previous_size - 32; --i) {
|
|
if (i == 0)
|
|
break; // This funkiness is to encourage unrolling.
|
|
if (maxbits(begin, begin + (b / i)) > i)
|
|
break;
|
|
}
|
|
return i + 1; // either i=0 and we return 1....or i is the first too-small
|
|
// size
|
|
}
|
|
} else {
|
|
// same thing with careful checks to avoid reading past end of buffer
|
|
uint32_t endIdx = b / previous_size;
|
|
if (endIdx >= span_length)
|
|
endIdx = span_length;
|
|
|
|
if (maxbits(begin, begin + endIdx) > previous_size) {
|
|
for (uint32_t i = previous_size + 1;; ++i) {
|
|
endIdx = b / i;
|
|
if (endIdx >= span_length)
|
|
endIdx = span_length;
|
|
if (maxbits(begin, begin + endIdx) <= i)
|
|
return i;
|
|
}
|
|
return -1; // impossible
|
|
} else {
|
|
uint32_t i;
|
|
for (i = previous_size - 1; i > 0; --i) {
|
|
endIdx = b / i;
|
|
if (endIdx >= span_length)
|
|
endIdx = span_length;
|
|
if (maxbits(begin, begin + endIdx) > i)
|
|
break;
|
|
}
|
|
return i + 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
class BitWidthHistoGram {
|
|
public:
|
|
std::vector<double> histo;
|
|
BitWidthHistoGram() : histo(33, 0) {}
|
|
|
|
void display(std::string prefix = "") {
|
|
double sum = 0;
|
|
for (size_t k = 0; k < histo.size(); ++k)
|
|
sum += histo[k];
|
|
if (sum == 0)
|
|
return;
|
|
for (size_t k = 0; k < histo.size(); ++k) {
|
|
std::cout << prefix << k << " " << histo[k] / sum << std::endl;
|
|
}
|
|
}
|
|
template <class container> void eatIntegers(const container &rawdata) {
|
|
for (uint32_t i = 0; i < rawdata.size(); ++i) {
|
|
histo[asmbits(rawdata[i])] += 1;
|
|
}
|
|
}
|
|
|
|
template <class container> void eatDGaps(const container &rawdata) {
|
|
if (rawdata.size() <= 1)
|
|
return;
|
|
for (uint32_t i = 0; i < rawdata.size() - 1; ++i) {
|
|
assert(rawdata[i + 1] > rawdata[i]);
|
|
uint32_t gap = rawdata[i + 1] - rawdata[i] - 1;
|
|
assert(gap < rawdata[i + 1]);
|
|
histo[asmbits(gap)] += 1;
|
|
}
|
|
}
|
|
};
|
|
|
|
} // namespace FastPFor
|
|
|
|
#endif
|