.
This commit is contained in:
116
idxcr.c
116
idxcr.c
@ -1,5 +1,5 @@
|
|||||||
/**
|
/**
|
||||||
Copyright (C) powturbo 2013-2014
|
Copyright (C) powturbo 2013-2015
|
||||||
GPL v2 License
|
GPL v2 License
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
This program is free software; you can redistribute it and/or modify
|
||||||
@ -16,13 +16,12 @@
|
|||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
- email : powturbo [AT] gmail.com
|
|
||||||
- github : https://github.com/powturbo
|
|
||||||
- homepage : https://sites.google.com/site/powturbo/
|
- homepage : https://sites.google.com/site/powturbo/
|
||||||
|
- github : https://github.com/powturbo
|
||||||
- twitter : https://twitter.com/powturbo
|
- twitter : https://twitter.com/powturbo
|
||||||
|
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||||
idxcr.c - "Integer Compression": create an inverted index for use by idxqry in benchmarking
|
**/
|
||||||
**/
|
// idxcr.c - "Integer Compression": create an inverted index for use by idxqry in benchmarking
|
||||||
#define _LARGEFILE64_SOURCE 1
|
#define _LARGEFILE64_SOURCE 1
|
||||||
#define _FILE_OFFSET_BITS 64
|
#define _FILE_OFFSET_BITS 64
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
@ -30,31 +29,17 @@
|
|||||||
#include <malloc.h>
|
#include <malloc.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
|
|
||||||
#include <getopt.h>
|
#include <getopt.h>
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
|
|
||||||
|
#include "conf.h"
|
||||||
#include "vint.h"
|
#include "vint.h"
|
||||||
#include "vp4dc.h"
|
#include "vp4dc.h"
|
||||||
|
|
||||||
#include "bitpack.h"
|
#include "bitpack.h"
|
||||||
#include "idx.h"
|
#include "idx.h"
|
||||||
//-------------------------------------- Simdcomp --------------------------------------------------------------------------
|
|
||||||
#include "ext/simdcomp/include/simdbitpacking.h"
|
|
||||||
|
|
||||||
// Pack the input in whole groups of 128 integers by calling simdpackwithoutmask once per group.
//   in  : source integers
//   n   : number of integers available at `in`
//   b   : passed unchanged to simdpackwithoutmask (presumably the bit width per value - confirm against simdcomp)
//   out : destination; advanced by 4*b 32-bit words after every group
// Only complete groups are processed: the loop stops as soon as fewer than 128 integers remain,
// so the trailing n % 128 integers are not written by this function.
// Returns the advanced `out` pointer cast to unsigned char *.
unsigned char *simdpackwn(uint32_t *in, uint32_t n, uint32_t b, uint32_t *out) {
|
|
||||||
uint32_t *in_;
|
|
||||||
for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpackwithoutmask(in, (__m128i *)out, b);
|
|
||||||
return (unsigned char *)out;
|
|
||||||
}
|
|
||||||
// Same group-wise loop as simdpackwn, but each group of 128 integers goes through
// simdpackwithoutmaskd1, which additionally receives `start`.
//   in    : source integers
//   n     : number of integers available at `in`
//   b     : passed unchanged to simdpackwithoutmaskd1 (presumably the bit width per value - confirm against simdcomp)
//   start : passed unchanged to simdpackwithoutmaskd1 for every group
//   out   : destination; advanced by 4*b 32-bit words after every group
// Only complete groups are processed; the trailing n % 128 integers are not written.
// Returns the advanced `out` pointer cast to unsigned char *.
// NOTE(review): `start` is never updated inside the loop, so every group sees the same value - confirm this is intended.
unsigned char *simdpackwn1(uint32_t *in, uint32_t n, uint32_t b, uint32_t start, uint32_t *out) {
|
|
||||||
uint32_t *in_;
|
|
||||||
for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpackwithoutmaskd1(start, in, (__m128i *)out, b);
|
|
||||||
return (unsigned char *)out;
|
|
||||||
}
|
|
||||||
//---------------------------------------------------------------------------------------------------------------
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
// DELTA(in, n, b): in-place gap transform, walking from index n-1 down to index 1.
// Each in[v] becomes (in[v] - in[v-1]) - 1; in[0] is left unchanged.
// All transformed values are OR-ed into b, which is finally replaced by bsr32(b)
// (bsr32 is defined elsewhere; presumably the bit width needed for the largest gap - confirm).
// The arguments are expanded unparenthesized and more than once, so pass plain lvalues/variables.
// n must be at least 1: with n == 0 the unsigned counter starts at n-1 and wraps around.
// This form expands to a bare brace block, so `DELTA(...);` directly before an `else` does not parse.
#define DELTA( __in, __n, __b) { unsigned _v; for(__b=0,_v = __n-1; _v > 0; --_v) __in[_v] = (__in[_v] - __in[_v-1]) - 1, __b |= __in[_v]; __b = bsr32(__b); }
|
// DELTA(in, n, b): in-place gap transform, walking from index n-1 down to index 1.
// Each in[v] becomes (in[v] - in[v-1]) - 1; in[0] is left unchanged.
// All transformed values are OR-ed into b, which is finally replaced by bsr32(b)
// (bsr32 is defined elsewhere; presumably the bit width needed for the largest gap - confirm).
// The arguments are expanded unparenthesized and more than once, so pass plain lvalues/variables.
// n must be at least 1: with n == 0 the unsigned counter starts at n-1 and wraps around.
// Wrapped in do { ... } while(0) so that `DELTA(...);` behaves as a single statement.
#define DELTA( __in, __n, __b) do { unsigned _v; for(__b=0,_v = __n-1; _v > 0; --_v) __in[_v] = (__in[_v] - __in[_v-1]) - 1, __b |= __in[_v]; __b = bsr32(__b); } while(0)
|
||||||
|
|
||||||
#define TERMNUM 2000000
|
#define TERMNUM 2000000
|
||||||
int verb;
|
int verb;
|
||||||
@ -66,6 +51,8 @@ void usage() {
|
|||||||
fprintf(stderr, "See http://lemire.me/data/integercompression2014.html'\n");
|
fprintf(stderr, "See http://lemire.me/data/integercompression2014.html'\n");
|
||||||
fprintf(stderr, "Usage: idxcr <docid file> <destination dir>\n");
|
fprintf(stderr, "Usage: idxcr <docid file> <destination dir>\n");
|
||||||
fprintf(stderr, "ex. idxcr clueweb09.sorted idxdir\n\n");
|
fprintf(stderr, "ex. idxcr clueweb09.sorted idxdir\n\n");
|
||||||
|
fprintf(stderr, "ex. index partitions generated from idxseg\n\n");
|
||||||
|
fprintf(stderr, "ex. idxcr ./idxcr gov2.sorted.s* .\n\n");
|
||||||
exit(-1);
|
exit(-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -75,13 +62,13 @@ int main(int argc, char *argv[]) {
|
|||||||
for(;;) {
|
for(;;) {
|
||||||
if((c = getopt_long(argc, argv, "xv:", long_options, &option_index)) == -1) break;
|
if((c = getopt_long(argc, argv, "xv:", long_options, &option_index)) == -1) break;
|
||||||
switch(c) {
|
switch(c) {
|
||||||
case 0 : printf("Option %s", long_options[option_index].name); if(optarg) printf (" with arg %s", optarg); printf ("\n"); break;
|
case 0 : printf("Option %s", long_options[option_index].name);
|
||||||
|
if(optarg) printf (" with arg %s", optarg); printf ("\n"); break;
|
||||||
case 'v': verb = atoi(optarg); break;
|
case 'v': verb = atoi(optarg); break;
|
||||||
default: die("unknown option: %c \n", optopt);
|
default: die("unknown option: %c \n", optopt);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if(argc - optind < 2) usage();
|
if(argc - optind < 2) usage();
|
||||||
tmap_t *tmap = malloc(TERMNUM*sizeof(tmap_t)); if(!tmap) die("malloc error\n");
|
|
||||||
path = argv[--argc];
|
path = argv[--argc];
|
||||||
|
|
||||||
for(fno = optind; fno < argc; fno++) {
|
for(fno = optind; fno < argc; fno++) {
|
||||||
@ -92,62 +79,83 @@ int main(int argc, char *argv[]) {
|
|||||||
strcat(outname, p); strcat(outname,".i");
|
strcat(outname, p); strcat(outname,".i");
|
||||||
|
|
||||||
FILE *fi = fopen64(inname, "rb"); if(!fi) { fprintf(stderr, "open error '%s'", inname); perror(inname); exit(-1); } int fdi = fileno(fi);
|
FILE *fi = fopen64(inname, "rb"); if(!fi) { fprintf(stderr, "open error '%s'", inname); perror(inname); exit(-1); } int fdi = fileno(fi);
|
||||||
FILE *fo = fopen64(outname,"wb"),*fm; if(!fo) { fprintf(stderr, "creat error '%s'", outname); perror(outname); exit(-1); } fprintf(stderr, "file='%s'", outname);
|
FILE *fo = fopen64(outname,"wb"),*fm; if(!fo) { fprintf(stderr, "creat error '%s'", outname); perror(outname); exit(-1); } fprintf(stderr, "file='%s'", outname);
|
||||||
fseeko(fo, sizeof(unsigned)+sizeof(unsigned long long), SEEK_SET);
|
fseeko(fo, sizeof(unsigned)+sizeof(unsigned long long), SEEK_SET);
|
||||||
|
|
||||||
|
tmap_t *tmap = calloc(1, TERMNUM*sizeof(tmap_t));
|
||||||
|
if(!tmap) die("malloc error\n");
|
||||||
unsigned *in = NULL,*ip,*ep,num,tid=0,numx=0,outsize;
|
unsigned *in = NULL,*ip,*ep,num,tid=0,numx=0,outsize;
|
||||||
unsigned char *out = NULL;
|
unsigned char *out = NULL;
|
||||||
unsigned long long fofs;
|
unsigned long long fofs; unsigned long long postsize=0,skipsize = 0;
|
||||||
|
|
||||||
while(fread(&num, 1, 4, fi) == 4 && num) { // read number of docid in term
|
while(fread(&num, 1, 4, fi) == 4) { // read number of docid in term
|
||||||
|
if(!num) { ++tid; continue; }
|
||||||
unsigned bnum = (num+BLK_DIDNUM-1)/BLK_DIDNUM;
|
unsigned bnum = (num+BLK_DIDNUM-1)/BLK_DIDNUM;
|
||||||
if(num > numx) { numx = num;
|
if(num > numx) {
|
||||||
in = realloc(in, num*4+64);
|
numx = num;
|
||||||
outsize = num*4+bnum*sizeof(unsigned)*2+1024;
|
in = realloc(in, num*4+64);
|
||||||
out = realloc(out, outsize);
|
outsize = num*4+bnum*sizeof(unsigned)*SKIP_SIZE+1024;
|
||||||
|
out = realloc(out, outsize);
|
||||||
if(!in || !out) die("malloc err=%u", num);
|
if(!in || !out) die("malloc err=%u", num);
|
||||||
}
|
}
|
||||||
|
if(fread(in, 4, num, fi) != num) break; // read docid list
|
||||||
if(fread(in, 4, num, fi) != num) break; // read docid list
|
|
||||||
unsigned char *op = out,*_op;
|
unsigned char *op = out,*_op;
|
||||||
vbput(op, num); // store f_t
|
vbput(op, num); // store f_t
|
||||||
|
|
||||||
unsigned *pix = (unsigned *)op;
|
unsigned *pix = (unsigned *)op;
|
||||||
if(num > BLK_DIDNUM) op += bnum*sizeof(unsigned)*2;
|
if(num > BLK_DIDNUM) {
|
||||||
|
op += bnum*sizeof(unsigned)*SKIP_SIZE; skipsize += op-out;
|
||||||
|
}
|
||||||
for(_op = op, ip = in, ep = ip+num; ip < ep; ) {
|
for(_op = op, ip = in, ep = ip+num; ip < ep; ) {
|
||||||
|
unsigned n = min(ep-ip, BLK_DIDNUM), b = 0,bx; if(op+5*n > out+outsize) die("output buffer too small\n");
|
||||||
|
if(n > 1) {
|
||||||
|
DELTA(ip, n, b); //bitdelta32( in+1, --n, pa, in[0], mode);
|
||||||
|
#ifdef _TURBOPFOR
|
||||||
|
b = p4d32(ip+1, n-1, &bx);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
#ifdef SKIP_S
|
||||||
|
unsigned u = ip[0]<<SKIP_S|b; // save width
|
||||||
|
#else
|
||||||
|
unsigned u = ip[0]; // First docid
|
||||||
|
#endif
|
||||||
|
|
||||||
if(num > BLK_DIDNUM) { // skip/index. docid[0] and offset to compressed block
|
if(num > BLK_DIDNUM) { // skip/index. docid[0] and offset to compressed block
|
||||||
*pix = ip[0]; // First docid
|
*pix = u;
|
||||||
pix[bnum] = op-_op; // offset
|
#if SKIP_SIZE == 2
|
||||||
|
pix[bnum] = op - _op; // save posting offset
|
||||||
|
#endif
|
||||||
pix++;
|
pix++;
|
||||||
} else vbput(op, ip[0]); // skip not needed
|
} else vbput(op, u); // skip not needed
|
||||||
|
|
||||||
unsigned n = min(ep-ip, BLK_DIDNUM),b=0; if(op+5*n > out+outsize) die("output buffer too small\n");
|
|
||||||
if(n > 1) {
|
if(n > 1) {
|
||||||
DELTA(ip, n, b);
|
#ifndef SKIP_S
|
||||||
#ifdef USE_SIMDPACK
|
*op++ = b;
|
||||||
if(n < 129) { *op++ = b; op = bitpack32( ip+1, n-1, b, op); } //op = vbenc(ip+1, n-1, op);
|
#endif
|
||||||
else { *op++ = b; op = simdpackwn(ip+1, n-1, b, (unsigned *)op); }
|
#ifdef _TURBOPFOR
|
||||||
#elif defined(USE_TURBOPFOR)
|
*op++ = bx;
|
||||||
op = p4denc32( ip+1, n-1, op);
|
op = n==129?p4dev32( ip+1, n-1, op, b, bx):p4de32( ip+1, n-1, op, b, bx);
|
||||||
#else
|
#else
|
||||||
*op++ = b; op = bitpack32(ip+1, n-1, b, op);
|
op = n==129?bitpackv32( ip+1, n-1, op, b) :bitpack32(ip+1, n-1, op, b);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
ip += n;
|
ip += n;
|
||||||
}
|
}
|
||||||
fofs = ftello(fo);
|
fofs = ftello(fo);
|
||||||
tmap_t *t = &tmap[tid++];
|
tmap_t *t = &tmap[tid++];
|
||||||
TIDMAPSET(t, fofs);
|
TIDMAPSET(t, fofs);
|
||||||
if(fwrite(out, 1, op-out, fo) < 0) die("fwrite error\n");
|
if(fwrite(out, 1, op-out, fo) < 0) die("fwrite error\n"); postsize += op-out;
|
||||||
}
|
}
|
||||||
fofs = ftello(fo); // write termmap
|
fofs = ftello(fo); // write termmap
|
||||||
if(fwrite(tmap, 1, tid*sizeof(tmap_t), fo) < 0) die("fwrite error\n");
|
if(fwrite(tmap, 1, tid*sizeof(tmap_t), fo) < 0) die("fwrite error\n");
|
||||||
|
|
||||||
fseeko(fo, 0, SEEK_SET);
|
fseeko(fo, 0, SEEK_SET);
|
||||||
if(fwrite(&fofs, 1, sizeof(unsigned long long), fo) < 0) die("fwrite error\n");
|
if(fwrite(&fofs, 1, sizeof(unsigned long long), fo) < 0) die("fwrite error\n");
|
||||||
if(fwrite(&tid, 1, sizeof(unsigned), fo) < 0) die("fwrite error\n");
|
if(fwrite(&tid, 1, sizeof(unsigned), fo) < 0) die("fwrite error\n"); fofs = ftello(fi);
|
||||||
|
|
||||||
fclose(fi); fclose(fo);
|
fclose(fi); fclose(fo);
|
||||||
if(in) { free(in); free(out); }
|
if(in) { free(in); free(out); }
|
||||||
|
free(tmap);
|
||||||
|
printf("\nterms=%u size=[tmap=%u skip=%llu post=%llu total=%llu, ratio=%.2f\%\n", tid, (unsigned)(tid*sizeof(tmap_t)), skipsize, postsize-skipsize, postsize+tid*sizeof(tmap_t)+12, (double)postsize*100/(double)fofs );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user