New functions + Integer Intersections
This commit is contained in:
73
README.md
73
README.md
@ -5,11 +5,12 @@ TurboPFor: Fastest Integer Compression [ scenarios
|
||||
- Bit Packing with **Direct/Random Access** without decompressing entire blocks
|
||||
- Access any single bit packed entry with **zero decompression**
|
||||
- **New:** **Direct Update** of individual bit packed entries
|
||||
- Reducing **Cache Pollution**
|
||||
<p>
|
||||
- Novel **"TurboPFor"** (Patched Frame-of-Reference) scheme with direct access or bulk decoding.
|
||||
@ -26,41 +27,60 @@ TurboPFor: Fastest Integer Compression [<br />
|
||||
|
||||
*./icbench -c1 -n10000000000 clueweb09.sorted*
|
||||
*./icbench -a1.5 -m0 -M8 -n100000000*
|
||||
|
||||
<table>
|
||||
<tr><th>Size</th><th>Ratio in %</th><th>Bits/Integer</th><th>C Time MB/s</th><th>D Time MB/s</th><th>Function</th></tr>
|
||||
<tr><th> 514438405</th><th>8.16</th><th>2.61</th><th>357.22</th><th>1286.42</th><th>TurboPFor</th></tr>
|
||||
<tr><th> 514438405</th><th>8.16</th><th>2.61</th><th>358.09</th><th>309.70</th><th>TurboPFor DA</th></tr>
|
||||
<tr><th> 539841792</th><th>8.56</th><th>2.74</th><th>6.47</th><th>767.35</th><th>OptP4</th></tr>
|
||||
<tr><th> 583184112</th><th>9.25</th><th>2.96</th><th>132.42</th><th>914.89</th><th>Simple16</th></tr>
|
||||
<tr><th> 623548565</th><th>9.89</th><th>3.17</th><th>235.32</th><th>925.71</th><th>SimpleV</th></tr>
|
||||
<tr><th> 733365952</th><th>11.64</th><th>3.72</th><th>162.21</th><th>1312.15</th><th>Simple64</th></tr>
|
||||
<tr><th> 862464289</th><th>13.68</th><th>4.38</th><th>1274.01</th><th>1980.55</th><th>TurboPack</th></tr>
|
||||
<tr><th> 862464289</th><th>13.68</th><th>4.38</th><th>1285.28</th><th>868.06</th><th>TurboPack DA</th></tr>
|
||||
<tr><th> 862465391</th><th>13.68</th><th>4.38</th><th>1402.12</th><th>2075.15</th><th>SIMD-BitPack FPF</th></tr>
|
||||
<tr><th>6303089028</th><th>100.00</th><th>32.00</th><th>1257.50</th><th>1308.22</th><th>copy</th></tr>
|
||||
<tr><th> 63392801</th><th>15.85</th><th> 5.07</th><th> 316.96</th><th> 893.67</th><th>TurboPFor</th></tr>
|
||||
<tr><th> 63392801</th><th>15.85</th><th> 5.07</th><th> 315.59</th><th> 227.15</th><th>TurboPForDA</th></tr>
|
||||
<tr><th> 65359916</th><th>16.34</th><th> 5.23</th><th> 7.09</th><th> 638.96</th><th>OptPFD</th></tr>
|
||||
<tr><th> 72364024</th><th>18.09</th><th> 5.79</th><th> 85.31</th><th> 762.00</th><th>Simple16</th></tr>
|
||||
<tr><th> 78514276</th><th>19.63</th><th> 6.28</th><th> 229.21</th><th> 748.32</th><th>SimpleV</th></tr>
|
||||
<tr><th> 95915096</th><th>23.98</th><th> 7.67</th><th> 221.46</th><th> 1049.70</th><th>Simple-8b</th></tr>
|
||||
<tr><th> 99910930</th><th>24.98</th><th> 7.99</th><th> 1553.92</th><th> 1904.21</th><th>SIMDPackFPF</th></tr>
|
||||
<tr><th> 99910930</th><th>24.98</th><th> 7.99</th><th> 953.29</th><th> 1872.02</th><th>TurboPack</th></tr>
|
||||
<tr><th> 99910930</th><th>24.98</th><th> 7.99</th><th> 953.13</th><th> 869.84</th><th>TurboPackDA</th></tr>
|
||||
<tr><th>102074663</th><th>25.52</th><th> 8.17</th><th> 1131.47</th><th> 1184.68</th><th>TurboVbyte</th></tr>
|
||||
<tr><th>102074663</th><th>25.52</th><th> 8.17</th><th> 1110.75</th><th> 897.86</th><th>VbyteFPF</th></tr>
|
||||
<tr><th>112500000</th><th>28.12</th><th> 9.00</th><th> 305.85</th><th> 1899.15</th><th>VarintG8IU</th></tr>
|
||||
<tr><th>400000000</th><th>100.00</th><th>32.00</th><th> 1470.87</th><th> 1477.93</th><th>Copy</th></tr>
|
||||
</table>
|
||||
|
||||
#### data files
|
||||
- gov2.sorted (from http://lemire.me/data/integercompression2014.html) Blocksize=128<br />
|
||||
(+ SimpleV 64k). Benchmark repeated several times.
|
||||
|
||||
*./icbench -c1 gov2.sorted*
|
||||
|
||||
<table>
|
||||
<tr><th>Size</th><th>Ratio in %</th><th>Bits/Integer</th><th>C Time MB/s</th><th>D Time MB/s</th><th>Function</th></tr>
|
||||
<tr><th> 3214763689</th><th>13.44</th><th>4.30</th><th>279.93</th><th> 665.41</th><th>SimpleV 64k</th></tr>
|
||||
<tr><th> 3337758854</th><th>13.95</th><th>4.47</th><th>5.06</th><th> 513.00</th><th>OptPFD</th></tr>
|
||||
<tr><th> 3357673495</th><th>14.04</th><th>4.49</th><th>270.57</th><th> 813.83</th><th>TurboPFor</th></tr>
|
||||
<tr><th> 3501671314</th><th>14.64</th><th>4.68</th><th>258.56</th><th> 720.76</th><th>SimpleV</th></tr>
|
||||
<tr><th> 3820190182</th><th>15.97</th><th>5.11</th><th>118.81</th><th> 650.21</th><th>Simple16</th></tr>
|
||||
<tr><th> 4521326518</th><th>18.90</th><th>6.05</th><th>209.17</th><th> 824.26</th><th>Simple-8b</th></tr>
|
||||
<tr><th> 4953768342</th><th>20.71</th><th>6.63</th><th>647.75</th><th>1501.24</th><th>TurboPack</th></tr>
|
||||
<tr><th> 5203353057</th><th>21.75</th><th>6.96</th><th>1560.34</th><th>1806.60</th><th>SIMDPackFPF D1</th></tr>
|
||||
<tr><th> 6699519000</th><th>28.01</th><th>8.96</th><th>502.86</th><th> 624.12</th><th>TurboVbyte</th></tr>
|
||||
<tr><th> 6699519000</th><th>28.01</th><th>8.96</th><th>472.01</th><th> 495.12</th><th>VbyteFPF</th></tr>
|
||||
<tr><th> 7622896878</th><th>31.87</th><th>10.20</th><th>208.73</th><th>1197.74</th><th>VarintG8IU</th></tr>
|
||||
<tr><th>23918861764</th><th>100.00</th><th>32.00</th><th>1391.82</th><th>1420.03</th><th>Copy</th></tr>
|
||||
</table>
|
||||
|
||||
## Compile:
|
||||
make
|
||||
*make*
|
||||
|
||||
## Benchmark
|
||||
## Testing
|
||||
###### Synthetic data:
|
||||
1. test all functions<br />
|
||||
|
||||
*./icbench -a1.0 -m0 -x8 -n100000000*
|
||||
*./icbench -a1.0 -m0 -M8 -n100000000*
|
||||
|
||||
- zipfian distribution alpha = 1.0 (Ex. -a1.0=uniform -a1.5=skewed distribution)
|
||||
- number of integers = 100000000
|
||||
@ -68,7 +88,7 @@ coming soon!
|
||||
|
||||
2. individual function test (ex. copy TurboPack TurboPack Direct access)<br />
|
||||
|
||||
*./icbench -a1.0 -m0 -x8 -ecopy/turbopack/turbopackda -n100000000*
|
||||
*./icbench -a1.5 -m0 -M8 -ecopy/turbopack/turbopackda -n100000000*
|
||||
|
||||
###### Data files:
|
||||
- Data file Benchmark (file format as in FastPFOR)
|
||||
@ -76,10 +96,10 @@ coming soon!
|
||||
*./icbench -c1 gov2.sorted*
|
||||
|
||||
###### Benchmarking intersections
|
||||
- Download "gov2.sorted" (or clueweb09) + query file "aol.txt"
|
||||
- Download gov2 (or clueweb09) + query file "aol.txt"
|
||||
from "http://lemire.me/data/integercompression2014.html"
|
||||
|
||||
- Create index file gov2.sorted.i
|
||||
- Create index file
|
||||
|
||||
*./idxcr gov2.sorted .*
|
||||
|
||||
@ -91,7 +111,7 @@ coming soon!
|
||||
|
||||
run queries in file "aol.txt" over the index of gov2 file
|
||||
|
||||
8GB Minimum of RAM required (16GB recommended for benchmarking "clueweb09" files).
|
||||
8GB RAM required (16GB recommended for benchmarking "clueweb09" files).
|
||||
|
||||
|
||||
## Function usage:
|
||||
@ -122,4 +142,3 @@ header files with documentation :<br />
|
||||
- OptP4 and Simple-16 from http://jinruhe.com/
|
||||
|
||||
#------------------------------------------------
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
// 64 bits version from: Vo Ngoc Anh, Alistair Moffat: Index compression using 64-bit words.
|
||||
// Softw., Pract. Exper. 40(2): 131-147 (2010)
|
||||
// http://ww2.cs.mu.oz.au/~alistair/coders-64bit/
|
||||
|
||||
#include "simple8b.h"
|
||||
#if defined(__x86_64__) || defined(__x86_32__)
|
||||
static inline int bsr32(int x) {
|
||||
int b = -1;
|
||||
@ -50,7 +50,7 @@ BIT_2_SEL
|
||||
|
||||
unsigned char *vs8benc(unsigned *__restrict in, int n, unsigned char *__restrict out) {
|
||||
unsigned long long __bw; unsigned __br = 0;
|
||||
unsigned char bits[0x1000];
|
||||
unsigned char bits[SIMPLE8BMAX];
|
||||
int elems;
|
||||
int i,j;
|
||||
for (i = 0; i < n; i++)
|
||||
|
@ -1,2 +1,3 @@
|
||||
#define SIMPLE8BMAX 1024
|
||||
unsigned char *vs8benc(unsigned *__restrict in, int n, unsigned char *__restrict out);
|
||||
unsigned char *vs8bdec(unsigned char *__restrict in, int n, unsigned *__restrict out);
|
||||
|
52
icbench.c
52
icbench.c
@ -36,11 +36,11 @@
|
||||
#include <sys/stat.h>
|
||||
#include <x86intrin.h>
|
||||
|
||||
// simple-8b simple16 optpfd don't work with all interger lists.
|
||||
// simple-8b, simple16 and optpfd don't work with all integer lists.
|
||||
// Enable them if you want to test them
|
||||
//#define USE_SIMPLE_8B // crashs on some lists
|
||||
//#define USE_SIMPLE16 // limited to 28 bits
|
||||
//#define USE_OPTPFD // compression too slow and limited to 28 bits. crashs on some lists
|
||||
#define USE_SIMPLE_8B // crashs on some lists
|
||||
#define USE_SIMPLE16 // limited to 28 bits
|
||||
#define USE_OPTPFD // compression too slow and limited to 28 bits. crashs on some lists
|
||||
#define STATS
|
||||
//---------------------------------------- Platform ------------------------
|
||||
#ifdef _WIN32
|
||||
@ -73,22 +73,29 @@ unsigned char *u32enc(unsigned *__restrict in, int n, unsigned *__restrict out)
|
||||
/* "Copy" decoder: copies n 32-bit values from in to out unchanged.
 * Returns a byte pointer to the position in the input just after the
 * last value consumed (in itself when n <= 0). */
unsigned char *u32dec(unsigned *__restrict in, int n, unsigned *__restrict out) {
  int copied;
  for (copied = 0; copied < n; copied++)
    out[copied] = in[copied];
  return (unsigned char *)(in + copied);
}
|
||||
|
||||
/* Number of whole bytes needed to hold nbits bits (rounds up to a multiple of 8 bits). */
#define PAD8(nbits) ( ((nbits) + 7) / 8 )
|
||||
unsigned char *_bitunpackx32(unsigned char *__restrict in, unsigned n, unsigned b, unsigned *__restrict out) { unsigned i,k=0; for(i=0; i < n; i++,k+=b ) *out++ = _bitgetx32(in, b, k); return in + PAD8(n*b); }
|
||||
/* Unpacks n values of b bits each from in into out by calling _bitgetx32
 * with a running bit offset (0, b, 2*b, ...).
 * Returns in advanced by PAD8(n*b) bytes. */
unsigned char *_bitunpackx32( unsigned char *__restrict in, unsigned n, unsigned b, unsigned *__restrict out) {
  unsigned idx, bitpos;
  for (idx = 0, bitpos = 0; idx < n; idx++) {
    out[idx] = _bitgetx32(in, b, bitpos);
    bitpos += b;
  }
  return in + PAD8(n*b);
}
|
||||
|
||||
unsigned char *bitdunpackx32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { int i; for(i = 0; i < n; i++) out[i] = (start += bitgetx32(in, b, i)+1); return in + PAD8(n*b); }
|
||||
unsigned char *bitd0unpackx32(unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { int i; for(i = 0; i < n; i++) out[i] = (start += bitgetx32(in, b, i)+1); return in + PAD8(n*b); }
|
||||
unsigned char *bitfunpackx32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { int i; for(i = 0; i < n; i++) out[i] = bitgetx32(in, b, i)+start+i+1; return in + PAD8(n*b); }
|
||||
unsigned char *bitf0unpackx32(unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { int i; for(i = 0; i < n; i++) out[i] = bitgetx32(in, b, i)+start; return in + PAD8(n*b); }
|
||||
/* Unpacks n b-bit entries via bitgetx32 and rebuilds a running sum:
 * each output is the previous value (initially start) plus the stored
 * entry plus 1. Returns in advanced by PAD8(n*b) bytes. */
unsigned char *bitdunpackx32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) {
  int pos = 0;
  while (pos < n) {
    start += bitgetx32(in, b, pos)+1;
    out[pos] = start;
    pos++;
  }
  return in + PAD8(n*b);
}
|
||||
/* Unpacks n b-bit entries via bitgetx32 and rebuilds a running sum:
 * each output is the previous value (initially start) plus the stored
 * entry plus 1. Returns in advanced by PAD8(n*b) bytes.
 * NOTE(review): this body is identical to bitdunpackx32, including the +1;
 * bitf0unpackx32 drops the +1 relative to bitfunpackx32, so confirm whether
 * the "0" variant here is meant to omit it as well. */
unsigned char *bitd0unpackx32(unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) {
  int pos = 0;
  while (pos < n) {
    start += bitgetx32(in, b, pos)+1;
    out[pos] = start;
    pos++;
  }
  return in + PAD8(n*b);
}
|
||||
/* Unpacks n b-bit entries via bitgetx32; output i is the stored entry
 * plus start plus i plus 1 (each value is offset from start by its own
 * position, with no running sum). Returns in advanced by PAD8(n*b) bytes. */
unsigned char *bitfunpackx32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) {
  int pos = 0;
  while (pos < n) {
    out[pos] = bitgetx32(in, b, pos)+start+pos+1;
    pos++;
  }
  return in + PAD8(n*b);
}
|
||||
/* Unpacks n b-bit entries via bitgetx32; every output is the stored
 * entry plus the constant start. Returns in advanced by PAD8(n*b) bytes. */
unsigned char *bitf0unpackx32(unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) {
  int pos = 0;
  while (pos < n) {
    out[pos] = bitgetx32(in, b, pos)+start;
    pos++;
  }
  return in + PAD8(n*b);
}
|
||||
//-------------------------------------- External functions for comparison ------------------------------------------------------------------------
|
||||
#include "ext/vas16c.h"
|
||||
#include "ext/vas16d.h"
|
||||
#include "ext/OPT_PFD/opt_p4.h"
|
||||
#include "ext/vabyte.h"
|
||||
#include "ext/simple8b.h"
|
||||
#include "ext/varintg8iu.h"
|
||||
#include "ext/varintg8iu.h"
|
||||
#include "ext/simdcomp/include/simdbitpacking.h"
|
||||
|
||||
#ifdef USE_SIMPLE16
|
||||
#include "ext/vas16c.h"
|
||||
#include "ext/vas16d.h"
|
||||
#endif
|
||||
#ifdef USE_OPTPFD
|
||||
#include "ext/OPT_PFD/opt_p4.h"
|
||||
#endif
|
||||
#ifdef USE_SIMPLE_8B
|
||||
#include "ext/simple8b.h"
|
||||
#endif
|
||||
|
||||
unsigned char *simdpackwn(uint32_t *in, uint32_t n, uint32_t b, uint32_t *out) {//checkifdivisibleby(n, 128); const uint32_t * const initout(out); //while(needPaddingTo128Bits(out)) *out++ = 123456;
|
||||
uint32_t *in_;
|
||||
for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpackwithoutmask(in, (__m128i *)out, b);
|
||||
@ -246,7 +253,14 @@ unsigned char *besenc(unsigned *in, size_t n, unsigned char *out, int id, int mo
|
||||
}
|
||||
}
|
||||
|
||||
#define UNDELTA(__out, __n, __mode) { unsigned _x,_v; for(_x = __out[0],_v=1;_v<__n;_v++) __out[_v] = (_x += __out[_v] + __mode); }
|
||||
/* In-place inverse delta (prefix sum) over __out[0..__n-1].
 * __out[0] is taken as the starting value; every following element becomes
 * previous_result + __out[v] + __mode (__mode is the constant added per step).
 * The main loop is unrolled 4x; the trailing loop handles the remaining
 * 0..3 elements so nothing at or beyond __out[__n] is read or written.
 * Arguments are evaluated more than once: pass side-effect-free expressions. */
#define UNDELTA(__out, __n, __mode) { unsigned _x,_v;\
  for(_x = (__out)[0],_v=1; _v+4 <= (__n); _v+=4) {\
    (__out)[_v  ] = (_x += (__out)[_v  ] + (__mode));\
    (__out)[_v+1] = (_x += (__out)[_v+1] + (__mode));\
    (__out)[_v+2] = (_x += (__out)[_v+2] + (__mode));\
    (__out)[_v+3] = (_x += (__out)[_v+3] + (__mode));\
  }\
  for(; _v < (__n); _v++)\
    (__out)[_v] = (_x += (__out)[_v] + (__mode));\
}
|
||||
|
||||
unsigned char *besdec(unsigned char *in, size_t n, unsigned *out, int id, int mode) { unsigned b,x,v;
|
||||
switch(id) {
|
||||
@ -331,7 +345,7 @@ struct libss libss[] = {
|
||||
// -------------- Simple family ----------------------------------
|
||||
{ P_SV, "SimpleV", 0 },
|
||||
#ifdef USE_SIMPLE_8B
|
||||
{ P_S64, "Simple-8b",0 }, //crash on 32 bits?
|
||||
{ P_S64, "Simple-8b",SIMPLE8BMAX }, //crash on 32 bits?
|
||||
#endif
|
||||
#ifdef USE_SIMPLE16
|
||||
{ P_S16, "Simple16", 0 }, //max. 28 bits
|
||||
@ -384,7 +398,7 @@ void stprint() {
|
||||
unsigned long long t=0;
|
||||
for(m = 0; m < 33; m++)
|
||||
t += xbits[m];
|
||||
printf("\ndistribution:");
|
||||
printf("\nbit size histogramm:");
|
||||
for(m = 0; m < 33; m++)
|
||||
if(xbits[m]) printf("%d:%.2f%% ", m, (double)xbits[m]*100/t); printf("\n");
|
||||
}
|
||||
@ -464,13 +478,13 @@ void usage() {
|
||||
fprintf(stderr, "<options>\n");
|
||||
fprintf(stderr, " -bNm N = blocksize (default 128) m=k kilobyte ex. -b64k\n");
|
||||
fprintf(stderr, " -cN N = format ordered(0:delta+0,1:delta+1),2=convert text to integer format\n");
|
||||
fprintf(stderr, " -eS N = encoder scheme (default all)\n");
|
||||
fprintf(stderr, " -eS S = encoder schemes sparated by / (default all)\n");
|
||||
fprintf(stderr, " -tN N = time in seconds per interation\n");
|
||||
fprintf(stderr, " -TN N = Iterations (default 3)\n");
|
||||
fprintf(stderr, " -vN N = verbosity 1..3\n");
|
||||
fprintf(stderr, "----- file specified --------------\n");
|
||||
fprintf(stderr, " -rN N = max. file size to read\n");
|
||||
fprintf(stderr, "Ex. ./icbench -c1 gov2.sorted\n");
|
||||
fprintf(stderr, "Ex. ./icbench -c1 gov2.sorted -eturbopack/turbopfor\n");
|
||||
fprintf(stderr, "----- file not specified --------------\n");
|
||||
fprintf(stderr, " -aF F = zipfian distribution alpha ex. -a1.0 uniform -a1.5 skewed\n");
|
||||
fprintf(stderr, " -mN N = minimum integer generated in bits\n");
|
||||
@ -567,7 +581,7 @@ int main(int argc, char *argv[]) { int r;
|
||||
print(totlen, s, NULL);
|
||||
}
|
||||
} else { // Benchmark w. generated data
|
||||
printf("zipf alpha=%3.1f range[%u..%u].\nbit size histogramm: ", a, rm, rx);
|
||||
printf("zipf alpha=%3.1f range[%u..%u].\n ", a, rm, rx);
|
||||
*in = n;
|
||||
zipfgen(in+1, a, rm, rx, n); for(i = 1; i <= n; i++) xbits[bsr32(in[i])]++; stprint();
|
||||
if(mode>=0) { unsigned *ip=in+1; int v; for(v = 1; v < n; v++) { ip[v] += ip[v-1] + mode; if(ip[v]>(1u<<28)) die("overflow generating sorted array\n" ); } }
|
||||
|
4
idxqry.c
4
idxqry.c
@ -233,7 +233,7 @@ int postcmp(post_t *a, post_t *b) {
|
||||
|
||||
int intersec_max;
|
||||
|
||||
unsigned idxsearch(idxrd_t *idx, qry_t *q) {
|
||||
unsigned qrysearch(qry_t *q, idxrd_t *idx) {
|
||||
int f_t = 0, i;
|
||||
post_t *p, *pe, post[TERMNUM];
|
||||
unsigned did, elim, dids[TERMNUM][BLK_DIDNUM+31];
|
||||
@ -300,7 +300,7 @@ unsigned long long qrybatch(idxrd_t *idx, char *fqname, int *qid) {
|
||||
}
|
||||
if(qry.terms >= temin && qry.terms <= temax) { //int j; for(j=0;j < qry.terms;j++) { if(j) printf(" "); printf("%u", qry.term[j]); } printf(" %d \n", qry.terms);
|
||||
qry.id = ++id; tex = max(qry.terms,tex);
|
||||
f_t += idxsearch(idx, &qry); if(id >= qmax) break;
|
||||
f_t += qrysearch(&qry, idx); if(id >= qmax) break;
|
||||
}
|
||||
}
|
||||
fclose(fq);
|
||||
|
6
makefile
6
makefile
@ -8,13 +8,13 @@ bitunpack.o: $(BIT)bitunpack.c $(BIT)bitunpack_.h $(BIT)bitunpack.h $(BIT)bitunp
|
||||
gcc -O3 $(CFLAGS) -c $(BIT)bitunpack.c
|
||||
|
||||
bitpack.o: $(BIT)bitpack.c $(BIT)bitpack_.h $(BIT)bitpack.h $(BIT)bitpack64_.h
|
||||
gcc -O2 $(CFLAGS2) -c $(BIT)bitpack.c
|
||||
gcc -O2 $(CFLAGS) -c $(BIT)bitpack.c
|
||||
|
||||
vp4dc.o: $(BIT)vp4dc.c
|
||||
gcc -O3 $(CFLAGS2) -funroll-loops -c $(BIT)vp4dc.c
|
||||
gcc -O3 $(CFLAGS) -funroll-loops -c $(BIT)vp4dc.c
|
||||
|
||||
vp4dd.o: $(BIT)vp4dd.c
|
||||
gcc -O3 $(CFLAGS2) -funroll-loops -c $(BIT)vp4dd.c
|
||||
gcc -O3 $(CFLAGS) -funroll-loops -c $(BIT)vp4dd.c
|
||||
|
||||
SIMDCOMPD=ext/simdcomp/
|
||||
SIMDCOMP=$(SIMDCOMPD)bitpacka.o $(SIMDCOMPD)src/simdintegratedbitpacking.o $(SIMDCOMPD)src/simdcomputil.o $(SIMDCOMPD)src/simdbitpacking.o
|
||||
|
Reference in New Issue
Block a user