/** * This code is released under the * Apache License Version 2.0 http://www.apache.org/licenses/. * * (c) Daniel Lemire, http://lemire.me/en/ * and Owen Kaser */ #ifndef UTIL #define UTIL #include "common.h" #ifdef __linux__ #define USE_O_DIRECT #endif namespace FastPForLib { //#define STATS // taken from stackoverflow #ifndef NDEBUG #define ASSERT(condition, message) \ do { \ if (!(condition)) { \ std::cerr << "Assertion `" #condition "` failed in " << __FILE__ \ << " line " << __LINE__ << ": " << message << std::endl; \ std::exit(EXIT_FAILURE); \ } \ } while (false) #else #define ASSERT(condition, message) \ do { \ } while (false) #endif /** * Computes the greatest common divisor */ constexpr __attribute__((const)) uint32_t gcd(uint32_t x, uint32_t y) { return (x % y) == 0 ? y : gcd(y, x % y); } template __attribute__((const)) T *padTo32bits(T *inbyte) { return reinterpret_cast((reinterpret_cast(inbyte) + 3) & ~3); } template __attribute__((const)) const T *padTo32bits(const T *inbyte) { return reinterpret_cast((reinterpret_cast(inbyte) + 3) & ~3); } template __attribute__((const)) T *padTo64bits(T *inbyte) { return reinterpret_cast((reinterpret_cast(inbyte) + 7) & ~7); } template __attribute__((const)) const T *padTo64bits(const T *inbyte) { return reinterpret_cast((reinterpret_cast(inbyte) + 7) & ~7); } template __attribute__((const)) T *padTo128bits(T *inbyte) { return reinterpret_cast((reinterpret_cast(inbyte) + 15) & ~15); } template __attribute__((const)) const T *padTo128bits(const T *inbyte) { return reinterpret_cast( (reinterpret_cast(inbyte) + 15) & ~15); } template __attribute__((const)) T *padTo64bytes(T *inbyte) { return reinterpret_cast((reinterpret_cast(inbyte) + 63) & ~63); } template __attribute__((const)) const T *padTo64bytes(const T *inbyte) { return reinterpret_cast((reinterpret_cast(inbyte) + 63) & ~63); } template __attribute__((const)) bool needPaddingTo32Bits(const T *inbyte) { return (reinterpret_cast(inbyte) & 3) != 0; } template __attribute__((const)) bool needPaddingTo64Bits(const T *inbyte) { return (reinterpret_cast(inbyte) & 7) != 0; } template __attribute__((const)) bool needPaddingTo128Bits(const T *inbyte) { return (reinterpret_cast(inbyte) & 15) != 0; } template bool needPaddingTo64bytes(const T *inbyte) { return (reinterpret_cast(inbyte) & 63) != 0; } __attribute__((const)) inline uint32_t gccbits(const uint32_t v) { #ifdef _MSC_VER if (v == 0) { return 0; } unsigned long answer; _BitScanReverse(&answer, v); return answer + 1; #else return v == 0 ? 0 : 32 - __builtin_clz(v); #endif } #ifdef _MSC_VER // taken from // http://stackoverflow.com/questions/355967/how-to-use-msvc-intrinsics-to-get-the-equivalent-of-this-gcc-code uint32_t __builtin_clz(uint32_t x) { unsigned long r = 0; _BitScanReverse(&r, x); return (31 - r); } #endif __attribute__((const)) inline bool divisibleby(size_t a, uint32_t x) { return (a % x == 0); } /** * compute the deltas, you do not want to use this * function if speed matters. This is only for convenience. */ template container diffs(const container &in, const bool aredistinct) { container out; if (in.empty()) return out; out.resize(in.size() - 1); for (size_t k = 0; k < in.size() - 1; ++k) if (aredistinct) out.push_back(in[k + 1] - in[k] - 1); else out.push_back(in[k + 1] - in[k]); return out; } inline void checkifdivisibleby(size_t a, uint32_t x) { if (!divisibleby(a, x)) { std::ostringstream convert; convert << a << " not divisible by " << x; throw std::logic_error(convert.str()); } } template void printme(iter i, iter b) { for (iter j = i; j != b; ++j) std::cout << *j << " "; std::cout << std::endl; } __attribute__((const)) inline uint32_t asmbits(const uint32_t v) { #ifdef _MSC_VER return gccbits(v); #else if (v == 0) return 0; uint32_t answer; __asm__("bsr %1, %0;" : "=r"(answer) : "r"(v)); return answer + 1; #endif } __attribute__((const)) inline uint32_t slowbits(uint32_t v) { uint32_t r = 0; while (v) { r++; v = v >> 1; } return r; } __attribute__((const)) inline uint32_t bits(uint32_t v) { uint32_t r(0); if (v >= (1U << 15)) { v >>= 16; r += 16; } if (v >= (1U << 7)) { v >>= 8; r += 8; } if (v >= (1U << 3)) { v >>= 4; r += 4; } if (v >= (1U << 1)) { v >>= 2; r += 2; } if (v >= (1U << 0)) { v >>= 1; r += 1; } return r; } #ifndef _MSC_VER __attribute__((const)) constexpr uint32_t constexprbits(uint32_t v) { return v >= (1U << 15) ? 16 + constexprbits(v >> 16) : (v >= (1U << 7)) ? 8 + constexprbits(v >> 8) : (v >= (1U << 3)) ? 4 + constexprbits(v >> 4) : (v >= (1U << 1)) ? 2 + constexprbits(v >> 2) : (v >= (1U << 0)) ? 1 + constexprbits(v >> 1) : 0; } #else template struct exprbits { enum { value = 1 + exprbits<(N >> 1)>::value }; }; template <> struct exprbits<0> { enum { value = 0 }; }; #define constexprbits(n) exprbits::value #endif constexpr uint32_t div_roundup(uint32_t v, uint32_t divisor) { return (v + (divisor - 1)) / divisor; } template __attribute__((pure)) uint32_t maxbits(const iterator &begin, const iterator &end) { uint32_t accumulator = 0; for (iterator k = begin; k != end; ++k) { accumulator |= *k; } return gccbits(accumulator); } template uint32_t slowmaxbits(const iterator &begin, const iterator &end) { uint32_t accumulator = 0; for (iterator k = begin; k != end; ++k) { const uint32_t tb = gccbits(*k); if (tb > accumulator) accumulator = tb; } return accumulator; } // basically, we can sometimes memoize the maxbits computation // Since the first scan looks at b input words, the second looks // at b/2, the third looks at b/3... (total related to harmonic numbers) // it is probably only worthwhile to memoize the first maybe 20% prefix // (rest can be "naively" re-scanned if needed) // also, a useful heuristic should be to start with however many // bits are required for the first number in the sequence. Or OR // the first two or three values together (danger, what if you OR // more than you'd actually use?) // alternative heuristic is to start with however many bits you used for the // last encoding. See if it works. Yes: start sequential scan downward. No: // start sequential scan upward. // To be tried... // template // struct bitwise_or : public binary_function { // t operator()(t x, t y) { return x|y; } //}; // template int greedy_bit_size_lookahead(const iterator &begin, const iterator &end) { // assert(end- begin <= b); std::vector prefixOrBuffer(end - begin); // consider a preallocated buffer... partial_sum(begin, end, prefixOrBuffer.begin(), [](t x, t y) { return x | y; } // change dl's + to | // bitwise_or() ); // do the bitwise or-ing once only. if (end - begin == b) { // expected case, to help out compiler. Should be unrolled for (int i = 1; i < 31; ++i) if (prefixOrBuffer[b / i - 1] < (static_cast(1) << i)) return i; // assert(false); // cannot get here unless 32+ bits required return -1; } else { // general case, maybe less data than we could pack with 1-bit fields for (int i = 1; i < 31; ++i) { uint64_t indexToCheck = b / i - 1; if (indexToCheck >= prefixOrBuffer.size()) indexToCheck = prefixOrBuffer.size() - 1; if (prefixOrBuffer[indexToCheck] < (static_cast(1) << i)) return i; } // assert(false); return -1; } } // assume the previous bit size is close to the required bit size template int greedy_bit_size_lookahead(const iterator &begin, const iterator &end, uint32_t previous_size) { uint32_t span_length = end - begin; if (span_length == b) { // work on the specialization later... // try previous size if (maxbits(begin, begin + (b / previous_size)) > previous_size) { // previous_size is too small; go until you find something bigger that // works for (uint32_t i = previous_size + 1; i < previous_size + 32 /* was nothing */; ++i) // upper bound is only to encourage compiler to unroll if (maxbits(begin, begin + (b / i)) <= i) return i; return -1; // impossible } else { // previous_size works, but perhaps we can find something smaller // that also works uint32_t i; for (i = previous_size - 1; i /* > 0 */ != previous_size - 32; --i) { if (i == 0) break; // This funkiness is to encourage unrolling. if (maxbits(begin, begin + (b / i)) > i) break; } return i + 1; // either i=0 and we return 1....or i is the first too-small // size } } else { // same thing with careful checks to avoid reading past end of buffer uint32_t endIdx = b / previous_size; if (endIdx >= span_length) endIdx = span_length; if (maxbits(begin, begin + endIdx) > previous_size) { for (uint32_t i = previous_size + 1;; ++i) { endIdx = b / i; if (endIdx >= span_length) endIdx = span_length; if (maxbits(begin, begin + endIdx) <= i) return i; } return -1; // impossible } else { uint32_t i; for (i = previous_size - 1; i > 0; --i) { endIdx = b / i; if (endIdx >= span_length) endIdx = span_length; if (maxbits(begin, begin + endIdx) > i) break; } return i + 1; } } } class BitWidthHistoGram { public: std::vector histo; BitWidthHistoGram() : histo(33, 0) {} void display(std::string prefix = "") { double sum = 0; for (size_t k = 0; k < histo.size(); ++k) sum += histo[k]; if (sum == 0) return; for (size_t k = 0; k < histo.size(); ++k) { std::cout << prefix << k << " " << histo[k] / sum << std::endl; } } template void eatIntegers(const container &rawdata) { for (uint32_t i = 0; i < rawdata.size(); ++i) { histo[asmbits(rawdata[i])] += 1; } } template void eatDGaps(const container &rawdata) { if (rawdata.size() <= 1) return; for (uint32_t i = 0; i < rawdata.size() - 1; ++i) { assert(rawdata[i + 1] > rawdata[i]); uint32_t gap = rawdata[i + 1] - rawdata[i] - 1; assert(gap < rawdata[i + 1]); histo[asmbits(gap)] += 1; } } }; } // namespace FastPFor #endif