TurboPFor-Integer-Compression/ext/bench_/bench/util.h

/**
 * This code is released under the
 * Apache License Version 2.0 http://www.apache.org/licenses/.
 *
 * (c) Daniel Lemire, http://lemire.me/en/
 * and Owen Kaser
 */

#ifndef UTIL
#define UTIL
#include "common.h"

#ifdef __linux__
#define USE_O_DIRECT
#endif

namespace FastPForLib {

//#define STATS
// taken from stackoverflow
#ifndef NDEBUG
#define ASSERT(condition, message)                                             \
  do {                                                                         \
    if (!(condition)) {                                                        \
      std::cerr << "Assertion `" #condition "` failed in " << __FILE__         \
                << " line " << __LINE__ << ": " << message << std::endl;       \
      std::exit(EXIT_FAILURE);                                                 \
    }                                                                          \
  } while (false)
#else
#define ASSERT(condition, message)                                             \
  do {                                                                         \
  } while (false)
#endif

/**
 * Computes the greatest common divisor
 */
constexpr __attribute__((const)) uint32_t gcd(uint32_t x, uint32_t y) {
  return (x % y) == 0 ? y : gcd(y, x % y);
}

template <class T> __attribute__((const)) T *padTo32bits(T *inbyte) {
  return reinterpret_cast<T *>((reinterpret_cast<uintptr_t>(inbyte) + 3) & ~3);
}

template <class T>
__attribute__((const)) const T *padTo32bits(const T *inbyte) {
  return reinterpret_cast<const T *>((reinterpret_cast<uintptr_t>(inbyte) + 3) &
                                     ~3);
}

template <class T> __attribute__((const)) T *padTo64bits(T *inbyte) {
  return reinterpret_cast<T *>((reinterpret_cast<uintptr_t>(inbyte) + 7) & ~7);
}

template <class T>
__attribute__((const)) const T *padTo64bits(const T *inbyte) {
  return reinterpret_cast<const T *>((reinterpret_cast<uintptr_t>(inbyte) + 7) &
                                     ~7);
}

template <class T> __attribute__((const)) T *padTo128bits(T *inbyte) {
  return reinterpret_cast<T *>((reinterpret_cast<uintptr_t>(inbyte) + 15) &
                               ~15);
}

template <class T>
__attribute__((const)) const T *padTo128bits(const T *inbyte) {
  return reinterpret_cast<const T *>(
      (reinterpret_cast<uintptr_t>(inbyte) + 15) & ~15);
}

template <class T> __attribute__((const)) T *padTo64bytes(T *inbyte) {
  return reinterpret_cast<T *>((reinterpret_cast<uintptr_t>(inbyte) + 63) &
                               ~63);
}

template <class T>
__attribute__((const)) const T *padTo64bytes(const T *inbyte) {
  return reinterpret_cast<T *>((reinterpret_cast<uintptr_t>(inbyte) + 63) &
                               ~63);
}

template <class T>
__attribute__((const)) bool needPaddingTo32Bits(const T *inbyte) {
  return (reinterpret_cast<uintptr_t>(inbyte) & 3) != 0;
}

template <class T>
__attribute__((const)) bool needPaddingTo64Bits(const T *inbyte) {
  return (reinterpret_cast<uintptr_t>(inbyte) & 7) != 0;
}

template <class T>
__attribute__((const)) bool needPaddingTo128Bits(const T *inbyte) {
  return (reinterpret_cast<uintptr_t>(inbyte) & 15) != 0;
}

template <class T> bool needPaddingTo64bytes(const T *inbyte) {
  return (reinterpret_cast<uintptr_t>(inbyte) & 63) != 0;
}

__attribute__((const)) inline uint32_t gccbits(const uint32_t v) {
#ifdef _MSC_VER
  if (v == 0) {
    return 0;
  }
  unsigned long answer;
  _BitScanReverse(&answer, v);
  return answer + 1;
#else
  return v == 0 ? 0 : 32 - __builtin_clz(v);
#endif
}

#ifdef _MSC_VER
// taken from
// http://stackoverflow.com/questions/355967/how-to-use-msvc-intrinsics-to-get-the-equivalent-of-this-gcc-code
uint32_t __builtin_clz(uint32_t x) {
  unsigned long r = 0;
  _BitScanReverse(&r, x);
  return (31 - r);
}

#endif

__attribute__((const)) inline bool divisibleby(size_t a, uint32_t x) {
  return (a % x == 0);
}

/**
 * compute the deltas, you do not want to use this
 * function if speed matters. This is only for convenience.
 */
template <class container>
container diffs(const container &in, const bool aredistinct) {
  container out;
  if (in.empty())
    return out;
  out.resize(in.size() - 1);
  for (size_t k = 0; k < in.size() - 1; ++k)
    if (aredistinct)
      out.push_back(in[k + 1] - in[k] - 1);
    else
      out.push_back(in[k + 1] - in[k]);
  return out;
}

inline void checkifdivisibleby(size_t a, uint32_t x) {
  if (!divisibleby(a, x)) {
    std::ostringstream convert;
    convert << a << " not divisible by " << x;
    throw std::logic_error(convert.str());
  }
}

template <class iter> void printme(iter i, iter b) {
  for (iter j = i; j != b; ++j)
    std::cout << *j << " ";
  std::cout << std::endl;
}

__attribute__((const)) inline uint32_t asmbits(const uint32_t v) {
#ifdef _MSC_VER
  return gccbits(v);
#else
  if (v == 0)
    return 0;
  uint32_t answer;
  __asm__("bsr %1, %0;" : "=r"(answer) : "r"(v));
  return answer + 1;
#endif
}

__attribute__((const)) inline uint32_t slowbits(uint32_t v) {
  uint32_t r = 0;
  while (v) {
    r++;
    v = v >> 1;
  }
  return r;
}

__attribute__((const)) inline uint32_t bits(uint32_t v) {
  uint32_t r(0);
  if (v >= (1U << 15)) {
    v >>= 16;
    r += 16;
  }
  if (v >= (1U << 7)) {
    v >>= 8;
    r += 8;
  }
  if (v >= (1U << 3)) {
    v >>= 4;
    r += 4;
  }
  if (v >= (1U << 1)) {
    v >>= 2;
    r += 2;
  }
  if (v >= (1U << 0)) {
    v >>= 1;
    r += 1;
  }
  return r;
}

#ifndef _MSC_VER
__attribute__((const)) constexpr uint32_t constexprbits(uint32_t v) {
  return v >= (1U << 15)
             ? 16 + constexprbits(v >> 16)
             : (v >= (1U << 7))
                   ? 8 + constexprbits(v >> 8)
                   : (v >= (1U << 3))
                         ? 4 + constexprbits(v >> 4)
                         : (v >= (1U << 1))
                               ? 2 + constexprbits(v >> 2)
                               : (v >= (1U << 0)) ? 1 + constexprbits(v >> 1)
                                                  : 0;
}
#else

template <int N> struct exprbits {
  enum { value = 1 + exprbits<(N >> 1)>::value };
};

template <> struct exprbits<0> {
  enum { value = 0 };
};

#define constexprbits(n) exprbits<n>::value

#endif

constexpr uint32_t div_roundup(uint32_t v, uint32_t divisor) {
  return (v + (divisor - 1)) / divisor;
}

template <class iterator>
__attribute__((pure)) uint32_t maxbits(const iterator &begin,
                                       const iterator &end) {
  uint32_t accumulator = 0;
  for (iterator k = begin; k != end; ++k) {
    accumulator |= *k;
  }
  return gccbits(accumulator);
}

template <class iterator>
uint32_t slowmaxbits(const iterator &begin, const iterator &end) {
  uint32_t accumulator = 0;
  for (iterator k = begin; k != end; ++k) {
    const uint32_t tb = gccbits(*k);
    if (tb > accumulator)
      accumulator = tb;
  }
  return accumulator;
}

// basically, we can sometimes memoize the maxbits computation
// Since the first scan looks at b input words, the second looks
// at b/2, the third looks at b/3... (total related to harmonic numbers)
// it is probably only worthwhile to memoize the first maybe 20% prefix
// (rest can be "naively"  re-scanned if needed)
// also, a useful heuristic should be to start with however many
// bits are required for the first number in the sequence.  Or OR
// the first two or three values together (danger, what if you OR
// more than you'd actually use?)
// alternative heuristic is to start with however many bits you used for the
// last encoding.  See if it works.  Yes: start sequential scan downward.  No:
// start sequential scan upward.
// To be tried...

// template<class t>
// struct bitwise_or : public binary_function<t, t, t> {
// t operator()(t x, t y) { return x|y; }
//};

//
template <int b, class t, class iterator>
int greedy_bit_size_lookahead(const iterator &begin, const iterator &end) {
  //  assert(end- begin <= b);
  std::vector<t> prefixOrBuffer(end -
                                begin); // consider a  preallocated buffer...

  partial_sum(begin, end, prefixOrBuffer.begin(),
              [](t x, t y) { return x | y; } // change dl's + to |
              // bitwise_or<t>()
              );
  // do the bitwise or-ing once only.
  if (end - begin ==
      b) { // expected case, to help out compiler.  Should be unrolled
    for (int i = 1; i < 31; ++i)
      if (prefixOrBuffer[b / i - 1] < (static_cast<t>(1) << i))
        return i;
    // assert(false); // cannot get here unless 32+ bits required
    return -1;
  } else { // general case, maybe less data than we could pack with 1-bit fields
    for (int i = 1; i < 31; ++i) {
      uint64_t indexToCheck = b / i - 1;
      if (indexToCheck >= prefixOrBuffer.size())
        indexToCheck = prefixOrBuffer.size() - 1;

      if (prefixOrBuffer[indexToCheck] < (static_cast<t>(1) << i))
        return i;
    }
    // assert(false);
    return -1;
  }
}

// assume the previous bit size is close to the required bit size
template <int b, class t, class iterator>
int greedy_bit_size_lookahead(const iterator &begin, const iterator &end,
                              uint32_t previous_size) {

  uint32_t span_length = end - begin;
  if (span_length == b) { // work on the specialization later...
    // try previous size
    if (maxbits(begin, begin + (b / previous_size)) > previous_size) {
      // previous_size is too small; go until you find something bigger that
      // works
      for (uint32_t i = previous_size + 1;
           i < previous_size + 32 /* was nothing */;
           ++i) // upper bound is only to encourage compiler to unroll
        if (maxbits(begin, begin + (b / i)) <= i)
          return i;
      return -1; // impossible
    } else { // previous_size works, but perhaps we can find something smaller
             // that also works
      uint32_t i;
      for (i = previous_size - 1; i /* > 0 */ != previous_size - 32; --i) {
        if (i == 0)
          break; // This funkiness is to encourage unrolling.
        if (maxbits(begin, begin + (b / i)) > i)
          break;
      }
      return i + 1; // either i=0 and we return 1....or i is the first too-small
                    // size
    }
  } else {
    // same thing with careful checks to avoid reading past end of buffer
    uint32_t endIdx = b / previous_size;
    if (endIdx >= span_length)
      endIdx = span_length;

    if (maxbits(begin, begin + endIdx) > previous_size) {
      for (uint32_t i = previous_size + 1;; ++i) {
        endIdx = b / i;
        if (endIdx >= span_length)
          endIdx = span_length;
        if (maxbits(begin, begin + endIdx) <= i)
          return i;
      }
      return -1; // impossible
    } else {
      uint32_t i;
      for (i = previous_size - 1; i > 0; --i) {
        endIdx = b / i;
        if (endIdx >= span_length)
          endIdx = span_length;
        if (maxbits(begin, begin + endIdx) > i)
          break;
      }
      return i + 1;
    }
  }
}

class BitWidthHistoGram {
public:
  std::vector<double> histo;
  BitWidthHistoGram() : histo(33, 0) {}

  void display(std::string prefix = "") {
    double sum = 0;
    for (size_t k = 0; k < histo.size(); ++k)
      sum += histo[k];
    if (sum == 0)
      return;
    for (size_t k = 0; k < histo.size(); ++k) {
      std::cout << prefix << k << " " << histo[k] / sum << std::endl;
    }
  }
  template <class container> void eatIntegers(const container &rawdata) {
    for (uint32_t i = 0; i < rawdata.size(); ++i) {
      histo[asmbits(rawdata[i])] += 1;
    }
  }

  template <class container> void eatDGaps(const container &rawdata) {
    if (rawdata.size() <= 1)
      return;
    for (uint32_t i = 0; i < rawdata.size() - 1; ++i) {
      assert(rawdata[i + 1] > rawdata[i]);
      uint32_t gap = rawdata[i + 1] - rawdata[i] - 1;
      assert(gap < rawdata[i + 1]);
      histo[asmbits(gap)] += 1;
    }
  }
};

} // namespace FastPFor

#endif