diff --git a/idxcr.c b/idxcr.c index 37815f6..c54cfbd 100644 --- a/idxcr.c +++ b/idxcr.c @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2014 + Copyright (C) powturbo 2013-2015 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -16,13 +16,12 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - email : powturbo [AT] gmail.com - - github : https://github.com/powturbo - homepage : https://sites.google.com/site/powturbo/ + - github : https://github.com/powturbo - twitter : https://twitter.com/powturbo - - idxcr.c - "Integer Compression" Create inverted index for using by idxqry for benchmarking -**/ + - email : powturbo [_AT_] gmail [_DOT_] com +**/ +// idxcr.c - "Integer Compression" Create inverted index for using by idxqry for benchmarking #define _LARGEFILE64_SOURCE 1 #define _FILE_OFFSET_BITS 64 #include @@ -30,31 +29,17 @@ #include #include #include -#include - +#include #include #include +#include "conf.h" #include "vint.h" #include "vp4dc.h" - #include "bitpack.h" #include "idx.h" -//-------------------------------------- Simdcomp -------------------------------------------------------------------------- -#include "ext/simdcomp/include/simdbitpacking.h" - -unsigned char *simdpackwn(uint32_t *in, uint32_t n, uint32_t b, uint32_t *out) { - uint32_t *in_; - for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpackwithoutmask(in, (__m128i *)out, b); - return (unsigned char *)out; -} -unsigned char *simdpackwn1(uint32_t *in, uint32_t n, uint32_t b, uint32_t start, uint32_t *out) { - uint32_t *in_; - for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpackwithoutmaskd1(start, in, (__m128i *)out, b); - return (unsigned char *)out; -} //--------------------------------------------------------------------------------------------------------------- -#define DELTA( __in, __n, __b) { unsigned _v; for(__b=0,_v = __n-1; _v > 0; 
--_v) __in[_v] = (__in[_v] - __in[_v-1]) - 1, __b |= __in[_v]; __b = bsr32(__b); } +#define DELTA( __in, __n, __b) do { unsigned _v; for(__b=0,_v = __n-1; _v > 0; --_v) __in[_v] = (__in[_v] - __in[_v-1]) - 1, __b |= __in[_v]; __b = bsr32(__b); } while(0) #define TERMNUM 2000000 int verb; @@ -66,6 +51,8 @@ void usage() { fprintf(stderr, "See http://lemire.me/data/integercompression2014.html'\n"); fprintf(stderr, "Usage: idxcr \n"); fprintf(stderr, "ex. idxcr clueweb09.sorted idxdir\n\n"); + fprintf(stderr, "ex. index partitions generated from idxseg\n\n"); + fprintf(stderr, "ex. idxcr ./idxcr gov2.sorted.s* .\n\n"); exit(-1); } @@ -75,13 +62,13 @@ int main(int argc, char *argv[]) { for(;;) { if((c = getopt_long(argc, argv, "xv:", long_options, &option_index)) == -1) break; switch(c) { - case 0 : printf("Option %s", long_options[option_index].name); if(optarg) printf (" with arg %s", optarg); printf ("\n"); break; + case 0 : printf("Option %s", long_options[option_index].name); + if(optarg) printf (" with arg %s", optarg); printf ("\n"); break; case 'v': verb = atoi(optarg); break; default: die("unknown option: %c \n", optopt); } } if(argc - optind < 2) usage(); - tmap_t *tmap = malloc(TERMNUM*sizeof(tmap_t)); if(!tmap) die("malloc error\n"); path = argv[--argc]; for(fno = optind; fno < argc; fno++) { @@ -92,62 +79,83 @@ int main(int argc, char *argv[]) { strcat(outname, p); strcat(outname,".i"); FILE *fi = fopen64(inname, "rb"); if(!fi) { fprintf(stderr, "open error '%s'", inname); perror(inname); exit(-1); } int fdi = fileno(fi); - FILE *fo = fopen64(outname,"wb"),*fm; if(!fo) { fprintf(stderr, "creat error '%s'", outname); perror(outname); exit(-1); } fprintf(stderr, "file='%s'", outname); + FILE *fo = fopen64(outname,"wb"),*fm; if(!fo) { fprintf(stderr, "creat error '%s'", outname); perror(outname); exit(-1); } fprintf(stderr, "file='%s'", outname); fseeko(fo, sizeof(unsigned)+sizeof(unsigned long long), SEEK_SET); + tmap_t *tmap = calloc(1, 
TERMNUM*sizeof(tmap_t)); + if(!tmap) die("malloc error\n"); unsigned *in = NULL,*ip,*ep,num,tid=0,numx=0,outsize; unsigned char *out = NULL; - unsigned long long fofs; + unsigned long long fofs; unsigned long long postsize=0,skipsize = 0; - while(fread(&num, 1, 4, fi) == 4 && num) { // read number of docid in term + while(fread(&num, 1, 4, fi) == 4) { // read number of docid in term + if(tid >= TERMNUM) die("too many terms (max=%d)\n", TERMNUM); /* tmap was allocated with exactly TERMNUM entries; stop before tid can index past it */ if(!num) { ++tid; continue; } /* empty posting list: keep the term id, leave its zeroed tmap slot */ unsigned bnum = (num+BLK_DIDNUM-1)/BLK_DIDNUM; - if(num > numx) { numx = num; - in = realloc(in, num*4+64); - outsize = num*4+bnum*sizeof(unsigned)*2+1024; - out = realloc(out, outsize); + if(num > numx) { + numx = num; + in = realloc(in, num*4+64); + outsize = num*4+bnum*sizeof(unsigned)*SKIP_SIZE+1024; + out = realloc(out, outsize); if(!in || !out) die("malloc err=%u", num); } - - if(fread(in, 4, num, fi) != num) break; // read docid list + if(fread(in, 4, num, fi) != num) break; // read docid list + unsigned char *op = out,*_op; - vbput(op, num); // store f_t - + vbput(op, num); // store f_t + unsigned *pix = (unsigned *)op; - if(num > BLK_DIDNUM) op += bnum*sizeof(unsigned)*2; + if(num > BLK_DIDNUM) { + op += bnum*sizeof(unsigned)*SKIP_SIZE; skipsize += op-out; + } for(_op = op, ip = in, ep = ip+num; ip < ep; ) { + unsigned n = min(ep-ip, BLK_DIDNUM), b = 0,bx; if(op+5*n > out+outsize) die("output buffer too small\n"); + if(n > 1) { + DELTA(ip, n, b); //bitdelta32( in+1, --n, pa, in[0], mode); + #ifdef _TURBOPFOR + b = p4d32(ip+1, n-1, &bx); + #endif + } + #ifdef SKIP_S + unsigned u = ip[0]< BLK_DIDNUM) { // skip/index. 
docid[0] and offset to compressed block - *pix = ip[0]; // First docid - pix[bnum] = op-_op; // offset + *pix = u; + #if SKIP_SIZE == 2 + pix[bnum] = op - _op; // save posting offset + #endif pix++; - } else vbput(op, ip[0]); // skip not needed - - unsigned n = min(ep-ip, BLK_DIDNUM),b=0; if(op+5*n > out+outsize) die("output buffer too small\n"); + } else vbput(op, u); // skip not needed + if(n > 1) { - DELTA(ip, n, b); - #ifdef USE_SIMDPACK - if(n < 129) { *op++ = b; op = bitpack32( ip+1, n-1, b, op); } //op = vbenc(ip+1, n-1, op); - else { *op++ = b; op = simdpackwn(ip+1, n-1, b, (unsigned *)op); } - #elif defined(USE_TURBOPFOR) - op = p4denc32( ip+1, n-1, op); - #else - *op++ = b; op = bitpack32(ip+1, n-1, b, op); + #ifndef SKIP_S + *op++ = b; + #endif + #ifdef _TURBOPFOR + *op++ = bx; + op = n==129?p4dev32( ip+1, n-1, op, b, bx):p4de32( ip+1, n-1, op, b, bx); + #else + op = n==129?bitpackv32( ip+1, n-1, op, b) :bitpack32(ip+1, n-1, op, b); #endif } ip += n; - } + } fofs = ftello(fo); tmap_t *t = &tmap[tid++]; TIDMAPSET(t, fofs); - if(fwrite(out, 1, op-out, fo) < 0) die("fwrite error\n"); + if(fwrite(out, 1, op-out, fo) != (size_t)(op-out)) die("fwrite error\n"); /* fwrite returns an unsigned count: a short write, not a negative value, signals failure */ postsize += op-out; } fofs = ftello(fo); // write termmap if(fwrite(tmap, 1, tid*sizeof(tmap_t), fo) < 0) die("fwrite error\n"); fseeko(fo, 0, SEEK_SET); if(fwrite(&fofs, 1, sizeof(unsigned long long), fo) < 0) die("fwrite error\n"); - if(fwrite(&tid, 1, sizeof(unsigned), fo) < 0) die("fwrite error\n"); - + if(fwrite(&tid, 1, sizeof(unsigned), fo) != sizeof(unsigned)) die("fwrite error\n"); /* compare against the requested byte count */ fofs = ftello(fi); fclose(fi); fclose(fo); if(in) { free(in); free(out); } + free(tmap); + /* "%%" emits a literal percent sign; "\%" is not a valid C escape */ printf("\nterms=%u size=[tmap=%u skip=%llu post=%llu total=%llu, ratio=%.2f%%\n", tid, (unsigned)(tid*sizeof(tmap_t)), skipsize, postsize-skipsize, postsize+tid*sizeof(tmap_t)+12, (double)postsize*100/(double)fofs ); } }