New functions + Integer Intersections

2015-01-06 14:28:11 +01:00
parent 736ac2e538
commit 17ef7510ac
6 changed files with 87 additions and 53 deletions
--- a/README.md
+++ b/README.md
@ -5,11 +5,12 @@ TurboPFor: Fastest Integer Compression [![Build Status](https://travis-ci.org/po
 <p>
 - Fastest **"Variable Byte"** implementation
 <p>
- Novel **"Variable Simple"** faster than simple16 and more compact than simple64
+- Novel **"Variable Simple"** faster than simple16 and more compact than simple-8b
 <p>
 - Scalar **"Bit Packing"** with bulk decoding as fast as SIMD FastPFor in realistic and practical (No "pure cache") scenarios
 - Bit Packing with **Direct/Random Access** without decompressing entire blocks
 - Access any single bit packed entry with **zero decompression**
+- **New:** **Direct Update** of individual bit packed entries
 - Reducing **Cache Pollution**
 <p>
 - Novel **"TurboPFor"** (Patched Frame-of-Reference) scheme with direct access or bulk decoding.
@ -26,41 +27,60 @@ TurboPFor: Fastest Integer Compression [![Build Status](https://travis-ci.org/po
 - Decompress only the minimum necessary blocks.

 # Benchmark:
-i7-2600k at 3.4GHz, gcc 4.9, ubuntu 14.10.
- Single thread
+i7-2600k at 3.4GHz, gcc 4.9, ubuntu 14.10, Single thread.
 - Realistic and practical benchmark with large integer arrays.
 - No PURE cache benchmark

 #### Synthetic data: 
-coming soon!
-
-#### data files
- - clueweb09.sorted from FastPFor (http://lemire.me/data/integercompression2014.html)<br />
-
-   *./icbench -c1 -n10000000000 clueweb09.sorted*
+    *./icbench -a1.5 -m0 -M8 -n100000000*

 <table>
 <tr><th>Size</th><th>Ratio in %</th><th>Bits/Integer</th><th>C Time MB/s</th><th>D Time MB/s</th><th>Function</th></tr>
-<tr><th> 514438405</th><th>8.16</th><th>2.61</th><th>357.22</th><th>1286.42</th><th>TurboPFor</th></tr>
-<tr><th> 514438405</th><th>8.16</th><th>2.61</th><th>358.09</th><th>309.70</th><th>TurboPFor DA</th></tr>
-<tr><th> 539841792</th><th>8.56</th><th>2.74</th><th>6.47</th><th>767.35</th><th>OptP4</th></tr>
-<tr><th> 583184112</th><th>9.25</th><th>2.96</th><th>132.42</th><th>914.89</th><th>Simple16</th></tr>
-<tr><th> 623548565</th><th>9.89</th><th>3.17</th><th>235.32</th><th>925.71</th><th>SimpleV</th></tr>
-<tr><th> 733365952</th><th>11.64</th><th>3.72</th><th>162.21</th><th>1312.15</th><th>Simple64</th></tr>
-<tr><th> 862464289</th><th>13.68</th><th>4.38</th><th>1274.01</th><th>1980.55</th><th>TurboPack</th></tr>
-<tr><th> 862464289</th><th>13.68</th><th>4.38</th><th>1285.28</th><th>868.06</th><th>TurboPack DA</th></tr>
-<tr><th> 862465391</th><th>13.68</th><th>4.38</th><th>1402.12</th><th>2075.15</th><th>SIMD-BitPack FPF</th></tr>
-<tr><th>6303089028</th><th>100.00</th><th>32.00</th><th>1257.50</th><th>1308.22</th><th>copy</th></tr>
+<tr><th> 63392801</th><th>15.85</th><th> 5.07</th><th>  316.96</th><th>  893.67</th><th>TurboPFor</th></tr>
+<tr><th> 63392801</th><th>15.85</th><th> 5.07</th><th>  315.59</th><th>  227.15</th><th>TurboPForDA</th></tr>
+<tr><th> 65359916</th><th>16.34</th><th> 5.23</th><th>    7.09</th><th>  638.96</th><th>OptPFD</th></tr>
+<tr><th> 72364024</th><th>18.09</th><th> 5.79</th><th>   85.31</th><th>  762.00</th><th>Simple16</th></tr>
+<tr><th> 78514276</th><th>19.63</th><th> 6.28</th><th>  229.21</th><th>  748.32</th><th>SimpleV</th></tr>
+<tr><th> 95915096</th><th>23.98</th><th> 7.67</th><th>  221.46</th><th> 1049.70</th><th>Simple-8b</th></tr>
+<tr><th> 99910930</th><th>24.98</th><th> 7.99</th><th> 1553.92</th><th> 1904.21</th><th>SIMDPackFPF</th></tr>
+<tr><th> 99910930</th><th>24.98</th><th> 7.99</th><th>  953.29</th><th> 1872.02</th><th>TurboPack</th></tr>
+<tr><th> 99910930</th><th>24.98</th><th> 7.99</th><th>  953.13</th><th>  869.84</th><th>TurboPackDA</th></tr>
+<tr><th>102074663</th><th>25.52</th><th> 8.17</th><th> 1131.47</th><th> 1184.68</th><th>TurboVbyte</th></tr>
+<tr><th>102074663</th><th>25.52</th><th> 8.17</th><th> 1110.75</th><th>  897.86</th><th>VbyteFPF</th></tr>
+<tr><th>112500000</th><th>28.12</th><th> 9.00</th><th>  305.85</th><th> 1899.15</th><th>VarintG8IU</th></tr>
+<tr><th>400000000</th><th>100.00</th><th>32.00</th><th> 1470.87</th><th> 1477.93</th><th>Copy</th></tr>
+</table>
+
+#### data files
+ - gov2.sorted (from http://lemire.me/data/integercompression2014.html) Blocksize=128<br />
+    (+ SimpleV 64k). Benchmark repeated several times.
+
+   *./icbench -c1 gov2.sorted*
+   
+<table>
+<tr><th>Size</th><th>Ratio in %</th><th>Bits/Integer</th><th>C Time MB/s</th><th>D Time MB/s</th><th>Function</th></tr>
+<tr><th> 3214763689</th><th>13.44</th><th>4.30</th><th>279.93</th><th> 665.41</th><th>SimpleV 64k</th></tr>
+<tr><th> 3337758854</th><th>13.95</th><th>4.47</th><th>5.06</th><th> 513.00</th><th>OptPFD</th></tr>
+<tr><th> 3357673495</th><th>14.04</th><th>4.49</th><th>270.57</th><th> 813.83</th><th>TurboPFor</th></tr>
+<tr><th> 3501671314</th><th>14.64</th><th>4.68</th><th>258.56</th><th> 720.76</th><th>SimpleV</th></tr>
+<tr><th> 3820190182</th><th>15.97</th><th>5.11</th><th>118.81</th><th> 650.21</th><th>Simple16</th></tr>
+<tr><th> 4521326518</th><th>18.90</th><th>6.05</th><th>209.17</th><th> 824.26</th><th>Simple-8b</th></tr>
+<tr><th> 4953768342</th><th>20.71</th><th>6.63</th><th>647.75</th><th>1501.24</th><th>TurboPack</th></tr>
+<tr><th> 5203353057</th><th>21.75</th><th>6.96</th><th>1560.34</th><th>1806.60</th><th>SIMDPackFPF D1</th></tr>
+<tr><th> 6699519000</th><th>28.01</th><th>8.96</th><th>502.86</th><th> 624.12</th><th>TurboVbyte</th></tr>
+<tr><th> 6699519000</th><th>28.01</th><th>8.96</th><th>472.01</th><th> 495.12</th><th>VbyteFPF</th></tr>
+<tr><th> 7622896878</th><th>31.87</th><th>10.20</th><th>208.73</th><th>1197.74</th><th>VarintG8IU</th></tr>
+<tr><th>23918861764</th><th>100.00</th><th>32.00</th><th>1391.82</th><th>1420.03</th><th>Copy</th></tr>
 </table>

 ## Compile:
-  make
+  *make*

-## Benchmark
+## Testing
 ###### Synthetic data: 
  1. test all functions<br />

-    *./icbench -a1.0 -m0 -x8 -n100000000*
+    *./icbench -a1.0 -m0 -M8 -n100000000*

    - zipfian distribution alpha = 1.0 (Ex. -a1.0=uniform -a1.5=skewed distribution)
    - number of integers = 100000000
@ -68,7 +88,7 @@ coming soon!
  
  2. individual function test (ex. copy TurboPack TurboPack Direct access)<br />

-    *./icbench -a1.0 -m0 -x8 -ecopy/turbopack/turbopackda -n100000000*
+    *./icbench -a1.5 -m0 -M8 -ecopy/turbopack/turbopackda -n100000000*

 ###### Data files: 
  - Data file Benchmark (file format as in FastPFOR)
@ -76,10 +96,10 @@ coming soon!
    *./icbench -c1 gov2.sorted*

 ###### Benchmarking intersections
-  - Download "gov2.sorted" (or clueweb09) + query file "aol.txt" 
+  - Download gov2 (or clueweb09) + query file "aol.txt" 
    from "http://lemire.me/data/integercompression2014.html"

-  - Create index file gov2.sorted.i
+  - Create index file

    *./idxcr gov2.sorted .*

@ -91,7 +111,7 @@ coming soon!

    run queries in file "aol.txt" over the index of gov2 file

-   8GB Minimum of RAM required (16GB recommended for benchmarking "clueweb09" files).
+   8GB RAM required (16GB recommended for benchmarking "clueweb09" files).


 ## Function usage:
@ -122,4 +142,3 @@ header files with documentation :<br />
 - OptP4 and Simple-16 from http://jinruhe.com/ 

 #------------------------------------------------
-
--- a/ext/simple8b.c
+++ b/ext/simple8b.c
@ -2,7 +2,7 @@
 // 64 bits version from: Vo Ngoc Anh, Alistair Moffat: Index compression using 64-bit words.
 // Softw., Pract. Exper. 40(2): 131-147 (2010)
 // http://ww2.cs.mu.oz.au/~alistair/coders-64bit/
-
+#include "simple8b.h"
  #if defined(__x86_64__) || defined(__x86_32__)
 static inline int bsr32(int x) {
  int b = -1;
@ -50,7 +50,7 @@ BIT_2_SEL
                       
 unsigned char *vs8benc(unsigned *__restrict in, int n, unsigned char *__restrict out) { 
  unsigned long long __bw; unsigned __br = 0; 
-  unsigned char bits[0x1000];                          
+  unsigned char bits[SIMPLE8BMAX];                          
  int elems;       
  int i,j;
  for (i = 0; i < n; i++) 
--- a/ext/simple8b.h
+++ b/ext/simple8b.h
@ -1,2 +1,3 @@
+#define SIMPLE8BMAX 1024
 unsigned char *vs8benc(unsigned      *__restrict in, int n, unsigned char *__restrict out);
 unsigned char *vs8bdec(unsigned char *__restrict in, int n, unsigned *__restrict out);
--- a/icbench.c
+++ b/icbench.c
@ -36,11 +36,11 @@
 #include <sys/stat.h>
 #include <x86intrin.h>

-// simple-8b simple16 optpfd don't work with all interger lists.
+// simple-8b, simple16 and optpfd don't work with all integer lists.
 // Enable them if you want to test.
-//#define USE_SIMPLE_8B  // crashs on some lists 
-//#define USE_SIMPLE16   // limited to 28 bits
-//#define USE_OPTPFD     // compression too slow and limited to 28 bits. crashs on some lists
+#define USE_SIMPLE_8B  // crashes on some lists
+#define USE_SIMPLE16   // limited to 28 bits
+#define USE_OPTPFD     // compression too slow and limited to 28 bits. crashes on some lists
 #define STATS
 //---------------------------------------- Platform ------------------------
  #ifdef _WIN32
@ -73,22 +73,29 @@ unsigned char *u32enc(unsigned *__restrict in, int n, unsigned *__restrict out)
 unsigned char *u32dec(unsigned *__restrict in, int n, unsigned *__restrict out) { unsigned *out_  = out+n; while(out < out_) *out++ = *in++; return (unsigned char *)in; }

 #define PAD8(__x) (((__x)+7)/8)
-unsigned char *_bitunpackx32(unsigned char *__restrict in, unsigned n, unsigned b, unsigned *__restrict out) { unsigned i,k=0; for(i=0; i < n; i++,k+=b ) *out++ = _bitgetx32(in, b, k); return in + PAD8(n*b); }
+unsigned char *_bitunpackx32( unsigned char *__restrict in, unsigned n, unsigned b,            unsigned *__restrict out) { unsigned i,k=0; for(i=0; i < n; i++,k+=b ) *out++ = _bitgetx32(in, b, k); return in + PAD8(n*b); }

-unsigned char *bitdunpackx32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned       *__restrict out) { int i; for(i = 0; i < n; i++) out[i] = (start += bitgetx32(in, b, i)+1); return in + PAD8(n*b); }
-unsigned char *bitd0unpackx32(unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned       *__restrict out) { int i; for(i = 0; i < n; i++) out[i] = (start += bitgetx32(in, b, i)+1); return in + PAD8(n*b); }
-unsigned char *bitfunpackx32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned       *__restrict out) { int i; for(i = 0; i < n; i++) out[i] = bitgetx32(in, b, i)+start+i+1; return in + PAD8(n*b); }
-unsigned char *bitf0unpackx32(unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned       *__restrict out) { int i; for(i = 0; i < n; i++) out[i] = bitgetx32(in, b, i)+start; return in + PAD8(n*b); }
+unsigned char *bitdunpackx32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { int i; for(i = 0; i < n; i++) out[i] = (start += bitgetx32(in, b, i)+1); return in + PAD8(n*b); }
+unsigned char *bitd0unpackx32(unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { int i; for(i = 0; i < n; i++) out[i] = (start += bitgetx32(in, b, i)+1); return in + PAD8(n*b); }
+unsigned char *bitfunpackx32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { int i; for(i = 0; i < n; i++) out[i] = bitgetx32(in, b, i)+start+i+1; return in + PAD8(n*b); }
+unsigned char *bitf0unpackx32(unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { int i; for(i = 0; i < n; i++) out[i] = bitgetx32(in, b, i)+start; return in + PAD8(n*b); }
 //-------------------------------------- External functions for comparison ------------------------------------------------------------------------
-#include "ext/vas16c.h"
-#include "ext/vas16d.h"
-#include "ext/OPT_PFD/opt_p4.h"
 #include "ext/vabyte.h"
-#include "ext/simple8b.h"
 #include "ext/varintg8iu.h"
 #include "ext/varintg8iu.h"
 #include "ext/simdcomp/include/simdbitpacking.h"

+  #ifdef USE_SIMPLE16
+#include "ext/vas16c.h"
+#include "ext/vas16d.h"
+  #endif
+  #ifdef USE_OPTPFD
+#include "ext/OPT_PFD/opt_p4.h"
+  #endif
+  #ifdef USE_SIMPLE_8B
+#include "ext/simple8b.h"
+  #endif
+
 unsigned char *simdpackwn(uint32_t *in, uint32_t n, uint32_t b, uint32_t *out) {//checkifdivisibleby(n, 128); const uint32_t * const initout(out);  //while(needPaddingTo128Bits(out)) *out++ = 123456;
  uint32_t *in_;
  for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpackwithoutmask(in, (__m128i *)out, b);
@ -246,7 +253,14 @@ unsigned char *besenc(unsigned *in, size_t n, unsigned char *out, int id, int mo
  }
 } 

-#define UNDELTA(__out, __n, __mode) { unsigned _x,_v; for(_x = __out[0],_v=1;_v<__n;_v++) __out[_v] = (_x += __out[_v] + __mode); }
+#define UNDELTA(__out, __n, __mode) { unsigned _x,_v; /* in-place running sum: __out[0] is the seed and is left unchanged */\
+  for(_x = __out[0],_v=1;_v<__n;_v+=4) { /* unrolled by four; only _v itself is tested against __n */\
+    __out[_v  ] = (_x += __out[_v  ] + __mode); /* element = previous result + stored value + __mode */\
+    __out[_v+1] = (_x += __out[_v+1] + __mode);\
+    __out[_v+2] = (_x += __out[_v+2] + __mode);\
+    __out[_v+3] = (_x += __out[_v+3] + __mode);\
+  }\
+} // NOTE(review): when (__n-1) is not a multiple of 4 the last iteration reads and writes up to 3 elements past __out[__n-1]; confirm every caller's buffer has that slack, or add a scalar tail loop

 unsigned char *besdec(unsigned char *in, size_t n, unsigned *out, int id, int mode) { unsigned b,x,v;
  switch(id) {
@ -331,7 +345,7 @@ struct libss libss[] = {
  // -------------- Simple family ---------------------------------- 
  { P_SV,    "SimpleV",  0 	},
    #ifdef USE_SIMPLE_8B
-  { P_S64,   "Simple-8b",0 	},  //crash on 32 bits?
+  { P_S64,   "Simple-8b",SIMPLE8BMAX 	},  //crash on 32 bits?
    #endif
    #ifdef USE_SIMPLE16
  { P_S16,   "Simple16", 0 	},  //max. 28 bits
@ -384,7 +398,7 @@ void stprint() {
  unsigned long long t=0; 
  for(m = 0; m < 33; m++) 
    t += xbits[m];
-  printf("\ndistribution:"); 
+  printf("\nbit size histogramm:"); 
  for(m = 0; m < 33; m++) 
    if(xbits[m]) printf("%d:%.2f%% ", m, (double)xbits[m]*100/t); printf("\n");
 }
@ -464,13 +478,13 @@ void usage() {
  fprintf(stderr, "<options>\n");
  fprintf(stderr, " -bNm    N = blocksize (default 128) m=k kilobyte ex. -b64k\n");
  fprintf(stderr, " -cN     N = format ordered(0:delta+0,1:delta+1),2=convert text to integer format\n");
-  fprintf(stderr, " -eS     N = encoder scheme (default all)\n");
+  fprintf(stderr, " -eS     S = encoder schemes sparated by / (default all)\n");
  fprintf(stderr, " -tN     N = time in seconds per interation\n");
  fprintf(stderr, " -TN     N = Iterations (default 3)\n");
  fprintf(stderr, " -vN     N = verbosity 1..3\n");
  fprintf(stderr, "----- file specified --------------\n");
  fprintf(stderr, " -rN     N = max. file size to read\n");
-  fprintf(stderr, "Ex. ./icbench -c1 gov2.sorted\n");
+  fprintf(stderr, "Ex. ./icbench -c1 gov2.sorted -eturbopack/turbopfor\n");
  fprintf(stderr, "----- file not specified --------------\n");
  fprintf(stderr, " -aF     F = zipfian distribution alpha ex. -a1.0 uniform -a1.5 skewed\n");
  fprintf(stderr, " -mN     N = minimum integer generated in bits\n");
@ -567,7 +581,7 @@ int main(int argc, char *argv[]) { int r;
        print(totlen, s, NULL);
      }
    } else { 			// Benchmark w. generated data
-      printf("zipf alpha=%3.1f range[%u..%u].\nbit size histogramm: ", a, rm, rx); 
+      printf("zipf alpha=%3.1f range[%u..%u].\n ", a, rm, rx); 
      *in = n; 
      zipfgen(in+1, a, rm, rx, n);   							for(i = 1; i <= n; i++) xbits[bsr32(in[i])]++; stprint();
      if(mode>=0) { unsigned *ip=in+1; int v; for(v = 1; v < n; v++) { ip[v] += ip[v-1] + mode; if(ip[v]>(1u<<28)) die("overflow generating sorted array\n" );  } }
--- a/idxqry.c
+++ b/idxqry.c
@ -233,7 +233,7 @@ int postcmp(post_t *a, post_t *b) {

 int intersec_max;

-unsigned idxsearch(idxrd_t *idx, qry_t *q) {
+unsigned qrysearch(qry_t *q, idxrd_t *idx) {
  int f_t = 0, i;
  post_t   *p, *pe, post[TERMNUM]; 
  unsigned did, elim, dids[TERMNUM][BLK_DIDNUM+31];
@ -300,7 +300,7 @@ unsigned long long   qrybatch(idxrd_t *idx, char *fqname, int *qid) {
    }																								 														
    if(qry.terms >= temin && qry.terms <= temax) {  									//int j; for(j=0;j < qry.terms;j++) { if(j) printf(" "); printf("%u", qry.term[j]); }  printf(" %d \n", qry.terms);
      qry.id  = ++id;																	tex = max(qry.terms,tex);
-      f_t += idxsearch(idx, &qry); 														if(id >= qmax) break;
+      f_t += qrysearch(&qry, idx); 														if(id >= qmax) break;
    }
  }
  fclose(fq);
--- a/6
+++ b/6
@ -8,13 +8,13 @@ bitunpack.o: $(BIT)bitunpack.c $(BIT)bitunpack_.h $(BIT)bitunpack.h $(BIT)bitunp
 	gcc -O3 $(CFLAGS) -c $(BIT)bitunpack.c

 bitpack.o: $(BIT)bitpack.c $(BIT)bitpack_.h $(BIT)bitpack.h $(BIT)bitpack64_.h
-	gcc -O2 $(CFLAGS2) -c $(BIT)bitpack.c
+	gcc -O2 $(CFLAGS) -c $(BIT)bitpack.c

 vp4dc.o: $(BIT)vp4dc.c
-	gcc -O3 $(CFLAGS2) -funroll-loops -c $(BIT)vp4dc.c
+	gcc -O3 $(CFLAGS) -funroll-loops -c $(BIT)vp4dc.c

 vp4dd.o: $(BIT)vp4dd.c
-	gcc -O3 $(CFLAGS2) -funroll-loops -c $(BIT)vp4dd.c
+	gcc -O3 $(CFLAGS) -funroll-loops -c $(BIT)vp4dd.c

 SIMDCOMPD=ext/simdcomp/
 SIMDCOMP=$(SIMDCOMPD)bitpacka.o $(SIMDCOMPD)src/simdintegratedbitpacking.o $(SIMDCOMPD)src/simdcomputil.o $(SIMDCOMPD)src/simdbitpacking.o