Files
TurboPFor-Integer-Compression/ext/bench_/bench/codecs.h
2017-01-02 23:30:16 +01:00

173 lines
5.6 KiB
C++

/**
* This code is released under the
* Apache License Version 2.0 http://www.apache.org/licenses/.
*
* (c) Daniel Lemire, http://lemire.me/en/
*/
#ifndef CODECS_H_
#define CODECS_H_
#include "common.h"
#include "util.h"
#include "bitpackinghelpers.h"
namespace FastPForLib {
/**
 * Exception thrown to signal that an output buffer is too small.
 *
 * The number of 32-bit symbols that would have been needed is kept in
 * `required` and is also spelled out in what(), so an uncaught exception
 * still produces a useful diagnostic.
 */
class NotEnoughStorage : public std::runtime_error {
public:
  size_t required; // number of 32-bit symbols required

  /**
   * @param req number of 32-bit symbols required
   */
  explicit NotEnoughStorage(const size_t req)
      : std::runtime_error("NotEnoughStorage: " + std::to_string(req) +
                           " 32-bit symbols required"),
        required(req) {}
};
class IntegerCODEC {
public:
/**
* You specify input and input length, as well as
* output and output length. nvalue gets modified to
* reflect how much was used. If the new value of
* nvalue is more than the original value, we can
* consider this a buffer overrun.
*
* You are responsible for allocating the memory (length
* for *in and nvalue for *out).
*/
virtual void encodeArray(const uint32_t *in, const size_t length,
uint32_t *out, size_t &nvalue) = 0;
/**
* Usage is similar to decodeArray except that it returns a pointer
* incremented from in. In theory it should be in+length. If the
* returned pointer is less than in+length, then this generally means
* that the decompression is not finished (some scheme compress
* the bulk of the data one way, and they then they compress remaining
* integers using another scheme).
*
* As with encodeArray, you need to have length element allocated
* for *in and at least nvalue elements allocated for out. The value
* of the variable nvalue gets updated with the number actually use
* (if nvalue exceeds the original value, there might be a buffer
* overrun).
*/
virtual const uint32_t *decodeArray(const uint32_t *in, const size_t length,
uint32_t *out, size_t &nvalue) = 0;
virtual ~IntegerCODEC() {}
/**
* Will compress the content of a vector into
* another vector.
*
* This is offered for convenience. It might be slow.
*/
virtual std::vector<uint32_t> compress(const std::vector<uint32_t> &data) {
std::vector<uint32_t> compresseddata(data.size() * 2 +
1024); // allocate plenty of memory
size_t memavailable = compresseddata.size();
encodeArray(&data[0], data.size(), &compresseddata[0], memavailable);
compresseddata.resize(memavailable);
return compresseddata;
}
/**
* Will uncompress the content of a vector into
* another vector. Some CODECs know exactly how much data to uncompress,
* others need to uncompress it all to know how data there is to uncompress...
* So it useful to have a hint (expected_uncompressed_size) that tells how
* much data there will be to uncompress. Otherwise, the code will
* try to guess, but the result is uncertain and inefficient. You really
* ought to keep track of how many symbols you had compressed.
*
* For convenience. Might be slow.
*/
virtual std::vector<uint32_t>
uncompress(const std::vector<uint32_t> &compresseddata,
size_t expected_uncompressed_size = 0) {
std::vector<uint32_t> data(
expected_uncompressed_size); // allocate plenty of memory
size_t memavailable = data.size();
try {
decodeArray(&compresseddata[0], compresseddata.size(), &data[0],
memavailable);
} catch (NotEnoughStorage &nes) {
data.resize(nes.required + 1024);
decodeArray(&compresseddata[0], compresseddata.size(), &data[0],
memavailable);
}
data.resize(memavailable);
return data;
}
virtual std::string name() const = 0;
};
/******************
* This just copies the data, no compression.
*/
class JustCopy : public IntegerCODEC {
public:
  /**
   * Copies length integers from in to out unchanged.
   *
   * @param in      source integers
   * @param length  number of integers to copy
   * @param out     destination; must have room for length integers
   * @param nvalue  set to length on return (its incoming value is not read)
   */
  void encodeArray(const uint32_t *in, const size_t length, uint32_t *out,
                   size_t &nvalue) {
    // memcpy with a null pointer is undefined even for a zero count, and an
    // empty caller-side buffer may well be represented by a null pointer.
    if (length != 0) {
      memcpy(out, in, sizeof(uint32_t) * length);
    }
    nvalue = length;
  }

  // like encodeArray, but we don't actually copy
  void fakeencodeArray(const uint32_t * /*in*/, const size_t length,
                       size_t &nvalue) {
    nvalue = length;
  }

  /**
   * Copies length integers from in to out unchanged.
   *
   * @param in      source words
   * @param length  number of words to copy
   * @param out     destination; must have room for length integers
   * @param nvalue  set to length on return (its incoming value is not read)
   * @return in + length
   */
  const uint32_t *decodeArray(const uint32_t *in, const size_t length,
                              uint32_t *out, size_t &nvalue) {
    // Same zero-length guard as in encodeArray.
    if (length != 0) {
      memcpy(out, in, sizeof(uint32_t) * length);
    }
    nvalue = length;
    return in + length;
  }

  /** @return the codec identifier "JustCopy" */
  std::string name() const { return "JustCopy"; }
};
/********
* This uses a single bit width for the whole array.
* It has fast decompression and random access, but
* relatively poor compression. Included as an example.
*/
class PackedCODEC : public IntegerCODEC {
public:
  enum { BlockSize = 32 };

  /**
   * Packs the whole input with one common bit width.
   *
   * Output layout: out[0] = length, out[1] = bit width b (as returned by
   * maxbits over the whole input), followed by the packed payload. Each
   * loop iteration consumes 32 input integers and advances the output by
   * b words.
   *
   * @param in      integers to pack
   * @param length  number of integers; passed to checkifdivisibleby(.., 32)
   * @param out     destination; must hold 2 + length * b / 32 words
   * @param nvalue  set to 2 + length * b / 32 on return (its incoming value
   *                is not read)
   */
  void encodeArray(const uint32_t *in, const size_t length, uint32_t *out,
                   size_t &nvalue) {
    checkifdivisibleby(length, 32);
    const uint32_t b = maxbits(in, in + length);
    // Two-word header: element count, then the shared bit width.
    out[0] = static_cast<uint32_t>(length);
    out[1] = b;
    out += 2;
    const size_t blocks = length / 32;
    for (size_t run = 0; run < blocks; ++run, in += 32, out += b) {
      fastpackwithoutmask(in, out, b);
    }
    nvalue = 2 + length * b / 32;
  }

  /**
   * Reverses encodeArray.
   *
   * Reads the element count from in[0] and the bit width from in[1], then
   * unpacks 32 integers per iteration, advancing the input by b words.
   *
   * @param in      packed data starting at the two-word header
   * @param length  number of words available at in; only checked by
   *                assertions, so it is ignored when NDEBUG is defined
   * @param out     destination; must have room for in[0] integers
   * @param nvalue  set to the element count stored in the header (its
   *                incoming value is not read)
   * @return pointer just past the last packed word consumed
   */
  const uint32_t *decodeArray(const uint32_t *in, const size_t length,
                              uint32_t *out, size_t &nvalue) {
    (void)length; // only used by the assertions below
    // The header must be present before it is read.
    assert(length >= 2);
    nvalue = in[0];
    const uint32_t b = in[1];
    // Header (2 words) plus payload must fit in the supplied input.
    assert(length >= 2 + nvalue * b / 32);
    in += 2;
    const size_t blocks = nvalue / 32;
    for (size_t run = 0; run < blocks; ++run, in += b, out += 32) {
      fastunpack(in, out, b);
    }
    return in;
  }

  /** @return the codec identifier "PackedCODEC" */
  std::string name() const { return "PackedCODEC"; }
};
} // namespace FastPForLib
#endif /* CODECS_H_ */