bm25: build index (inverted list, parallel building); add a tokenizer for both document indexing and search.
(cherry picked from commit ea882f08feabb5d386cde15a828581bc3665429b)
This commit is contained in:
@ -64,6 +64,7 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/chparser)
|
||||
endif()
|
||||
# Build optional subcomponents only when their source directories are present.
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/age)
    add_subdirectory(age)
endif()

if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/jieba_tokenizer)
    add_subdirectory(jieba_tokenizer)
endif()
|
||||
|
||||
@ -30,7 +30,7 @@
|
||||
#include "tokenizer.h"
|
||||
|
||||
/* Maximum string length fed to the CRC-based hash. */
const size_t MAX_LENGTH_CRC = 100;
/* Upper bound on keywords requested from the jieba extractor
 * (raised from 100 to 100000 so long documents are fully tokenized). */
const size_t MAX_KEYWORD_NUM = 100000;
/* Maximum filesystem path length for dictionary files. */
const size_t MAX_PATH_LEN = 1024;

/* Relative paths of the jieba dictionary files. */
const char* const DICT_PATH = "lib/jieba_dict/jieba.dict.utf8";
|
||||
@ -66,22 +66,31 @@ inline static uint32_t HashString2Uint32(const std::string& srcStr)
|
||||
return crc;
|
||||
}
|
||||
|
||||
inline static void ConvertEmbeddingMap(std::unordered_map<uint32_t, float> tempMap, EmbeddingMap *embeddingMap)
|
||||
inline static void ConvertEmbeddingMap(std::unordered_map<std::string, std::pair<uint32_t, float>> tokensMap,
|
||||
EmbeddingMap *embeddingMap)
|
||||
{
|
||||
embeddingMap->size = tempMap.size();
|
||||
embeddingMap->size = tokensMap.size();
|
||||
if (embeddingMap->size == 0) {
|
||||
return;
|
||||
}
|
||||
embeddingMap->pairs = (EmbeddingPair *)malloc(embeddingMap->size * sizeof(EmbeddingPair));
|
||||
if (embeddingMap->pairs == nullptr) {
|
||||
embeddingMap->tokens = (EmbeddingTokenInfo *)malloc(embeddingMap->size * sizeof(EmbeddingTokenInfo));
|
||||
if (embeddingMap->tokens == nullptr) {
|
||||
embeddingMap->size = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
size_t index = 0;
|
||||
for (const auto& pair : tempMap) {
|
||||
embeddingMap->pairs[index].key = pair.first;
|
||||
embeddingMap->pairs[index].value = pair.second;
|
||||
index++;
|
||||
size_t idx = 0;
|
||||
for (const auto& token : tokensMap) {
|
||||
embeddingMap->tokens[idx].key = token.second.first;
|
||||
embeddingMap->tokens[idx].value = token.second.second;
|
||||
errno_t rc = strncpy_s(embeddingMap->tokens[idx].token, MAX_TOKEN_LEN, token.first.c_str(), MAX_TOKEN_LEN - 1);
|
||||
if (rc != EOK) {
|
||||
free(embeddingMap->tokens);
|
||||
embeddingMap->tokens = nullptr;
|
||||
embeddingMap->size = 0;
|
||||
return;
|
||||
}
|
||||
idx++;
|
||||
}
|
||||
}
|
||||
|
||||
@ -148,31 +157,36 @@ bool ConvertString2Embedding(const char* srcStr, EmbeddingMap *embeddingMap, boo
|
||||
if (jiebaTokenizer == nullptr || srcStr == nullptr || embeddingMap == nullptr) {
|
||||
return false;
|
||||
}
|
||||
std::string srcStrInput(srcStr);
|
||||
std::unordered_map<uint32_t, float> tempMap;
|
||||
std::string sentence(srcStr);
|
||||
std::unordered_map<std::string, std::pair<uint32_t, float>> tokensMap;
|
||||
if (isKeywordExtractor) {
|
||||
std::vector<cppjieba::KeywordExtractor::Word> keywords;
|
||||
jiebaTokenizer->extractor.Extract(srcStrInput, keywords, MAX_KEYWORD_NUM);
|
||||
jiebaTokenizer->extractor.Extract(sentence, keywords, MAX_KEYWORD_NUM);
|
||||
for (const auto& keyword : keywords) {
|
||||
uint32_t hashValue = HashString2Uint32(Convert2LowerCase(keyword.word));
|
||||
tempMap[hashValue] += keyword.weight;
|
||||
tokensMap[keyword.word] = std::make_pair(hashValue, keyword.weight);
|
||||
}
|
||||
if (!tempMap.empty()) {
|
||||
ConvertEmbeddingMap(tempMap, embeddingMap);
|
||||
if (!tokensMap.empty()) {
|
||||
ConvertEmbeddingMap(tokensMap, embeddingMap);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// if the keywords extracted by 'Extract' are empty, then use 'Cut' for tokenization.
|
||||
std::vector<std::string> tokens;
|
||||
jiebaTokenizer->Cut(srcStrInput, tokens, true);
|
||||
jiebaTokenizer->Cut(sentence, tokens, true);
|
||||
for (const auto& token : tokens) {
|
||||
if (IsWhitespace(token)) {
|
||||
continue;
|
||||
}
|
||||
uint32_t hashValue = HashString2Uint32(Convert2LowerCase(token));
|
||||
tempMap[hashValue] += 1.0f;
|
||||
if (tokensMap.find(token) == tokensMap.end()) {
|
||||
tokensMap[token] = std::make_pair(hashValue, 1.0f);
|
||||
} else {
|
||||
tokensMap[token].second += 1.0f;
|
||||
}
|
||||
}
|
||||
ConvertEmbeddingMap(tempMap, embeddingMap);
|
||||
ConvertEmbeddingMap(tokensMap, embeddingMap);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@ -27,17 +27,20 @@
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#define MAX_TOKEN_LEN 100
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct EmbeddingPair {
|
||||
typedef struct EmbeddingTokenInfo {
|
||||
uint32_t key;
|
||||
float value;
|
||||
} EmbeddingPair;
|
||||
char token[MAX_TOKEN_LEN];
|
||||
} EmbeddingTokenInfo;
|
||||
|
||||
typedef struct {
|
||||
EmbeddingPair *pairs;
|
||||
EmbeddingTokenInfo *tokens;
|
||||
size_t size;
|
||||
} EmbeddingMap;
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -32,11 +32,133 @@
|
||||
#include "access/heapam.h"
|
||||
#include "catalog/index.h"
|
||||
#include "access/tableam.h"
|
||||
#include "db4ai/bayesnet.h"
|
||||
#include "access/datavec/bm25.h"
|
||||
|
||||
typedef struct BM25QueryToken {
|
||||
BlockNumber tokenPostingBlock;
|
||||
float qTokenMaxScore;
|
||||
float qTokenIDFVal;
|
||||
} BM25QueryToken;
|
||||
|
||||
typedef struct BM25QueryTokensInfo {
|
||||
BM25QueryToken *queryTokens;
|
||||
uint32 size;
|
||||
} BM25QueryTokensInfo;
|
||||
|
||||
/*
 * Scan one hash-buckets page and, for each still-unresolved query token,
 * record the block number of the bucket whose id matches the token's
 * hash modulo BM25_BUCKET_MAX_NUM. bucketFoundCount is incremented per
 * newly resolved token; the scan returns early once all tokens have one.
 */
static void FindBucketsLocation(Page page, BM25TokenizedDocData &tokenizedQuery, BlockNumber *bucketsLocation,
    uint32 &bucketFoundCount)
{
    OffsetNumber maxoffno = PageGetMaxOffsetNumber(page);
    for (OffsetNumber offnoBucket = FirstOffsetNumber; offnoBucket <= maxoffno; offnoBucket++) {
        BM25HashBucketPage bucket = (BM25HashBucketPage)PageGetItem(page, PageGetItemId(page, offnoBucket));
        for (size_t tokenIdx = 0; tokenIdx < tokenizedQuery.tokenCount; tokenIdx++) {
            if (bucketsLocation[tokenIdx] == InvalidBlockNumber &&
                bucket->bucketId == (tokenizedQuery.tokenDatas[tokenIdx].hashValue % BM25_BUCKET_MAX_NUM)) {
                bucketsLocation[tokenIdx] = bucket->bucketBlkno;
                bucketFoundCount++;
                if (bucketFoundCount >= tokenizedQuery.tokenCount) {
                    return;
                }
            }
        }
    }
}
|
||||
|
||||
/*
 * Scan one token-meta page for the tokenIdx-th query token (matched by
 * token text). On a match, record the token's max score, posting-list
 * block, and its BM25 IDF contribution:
 *   tf * ln(1 + (N - df + 0.5) / (df + 0.5))
 * where N is the document count and df the token's document frequency.
 */
static void FindTokenInfo(BM25MetaPageData &meta, Page page, BM25TokenizedDocData &tokenizedQuery,
    BM25QueryToken *queryTokens, size_t tokenIdx, uint32 &tokenFoundCount)
{
    OffsetNumber maxoffno = PageGetMaxOffsetNumber(page);
    for (OffsetNumber offnoTokenMeta = FirstOffsetNumber; offnoTokenMeta <= maxoffno; offnoTokenMeta++) {
        BM25TokenMetaPage tokenMeta = (BM25TokenMetaPage)PageGetItem(page, PageGetItemId(page, offnoTokenMeta));
        if (strncmp(tokenMeta->token, tokenizedQuery.tokenDatas[tokenIdx].tokenValue, BM25_MAX_TOKEN_LEN - 1) == 0) {
            queryTokens[tokenIdx].qTokenMaxScore = tokenMeta->maxScore;
            queryTokens[tokenIdx].tokenPostingBlock = tokenMeta->postingBlkno;
            queryTokens[tokenIdx].qTokenIDFVal = tokenizedQuery.tokenDatas[tokenIdx].tokenFreq *
                std::log((1 + ((float)meta.documentCount - (float)tokenMeta->docCount + 0.5) /
                    ((float)tokenMeta->docCount + 0.5)));
            tokenFoundCount++;
            if (tokenFoundCount >= tokenizedQuery.tokenCount) {
                return;
            }
        }
    }
}
|
||||
|
||||
/*
 * Tokenize the query sentence and resolve each token against the index:
 *   1. walk the hash-buckets chain to find each token's bucket block;
 *   2. walk each bucket's token-meta chain for posting list / IDF info;
 *   3. compact the found tokens into the returned array.
 * Returns an empty info ({nullptr, 0}) when the query has no tokens.
 */
static BM25QueryTokensInfo GetQueryTokens(Relation index, const char* sentence)
{
    BM25TokenizedDocData tokenizedQuery = BM25DocumentTokenize(sentence);
    if (tokenizedQuery.tokenCount == 0) {
        BM25QueryTokensInfo emptyTokensInfo{0};
        emptyTokensInfo.queryTokens = nullptr;
        emptyTokensInfo.size = 0;
        return emptyTokensInfo;
    }
    BM25QueryToken *queryTokens = (BM25QueryToken*)palloc0(sizeof(BM25QueryToken) * tokenizedQuery.tokenCount);
    BlockNumber *bucketsLocation = (BlockNumber*)palloc0(sizeof(BlockNumber) * tokenizedQuery.tokenCount);
    for (size_t tokenIdx = 0; tokenIdx < tokenizedQuery.tokenCount; tokenIdx++) {
        queryTokens[tokenIdx].tokenPostingBlock = InvalidBlockNumber;
        bucketsLocation[tokenIdx] = InvalidBlockNumber;
    }

    /* step 1: locate each token's hash bucket */
    uint32 bucketFoundCount = 0;
    BM25MetaPageData meta;
    BM25GetMetaPageInfo(index, &meta);
    BlockNumber nextHashBucketsBlkno = meta.entryPageList.hashBucketsPage;
    while (bucketFoundCount < tokenizedQuery.tokenCount && BlockNumberIsValid(nextHashBucketsBlkno)) {
        Buffer cHashBucketsbuf = ReadBuffer(index, nextHashBucketsBlkno);
        LockBuffer(cHashBucketsbuf, BUFFER_LOCK_SHARE);
        Page cHashBucketspage = BufferGetPage(cHashBucketsbuf);
        FindBucketsLocation(cHashBucketspage, tokenizedQuery, bucketsLocation, bucketFoundCount);
        nextHashBucketsBlkno = BM25PageGetOpaque(cHashBucketspage)->nextblkno;
        UnlockReleaseBuffer(cHashBucketsbuf);
    }

    /* step 2: fetch each found token's meta info from its bucket chain */
    uint32 tokenFoundCount = 0;
    for (size_t tokenIdx = 0; tokenIdx < tokenizedQuery.tokenCount; tokenIdx++) {
        if (!BlockNumberIsValid(bucketsLocation[tokenIdx])) {
            continue;
        }
        BlockNumber nextTokenMetasBlkno = bucketsLocation[tokenIdx];
        while (tokenFoundCount < tokenizedQuery.tokenCount && BlockNumberIsValid(nextTokenMetasBlkno)) {
            Buffer cTokenMetasbuf = ReadBuffer(index, nextTokenMetasBlkno);
            LockBuffer(cTokenMetasbuf, BUFFER_LOCK_SHARE);
            Page cTokenMetaspage = BufferGetPage(cTokenMetasbuf);
            FindTokenInfo(meta, cTokenMetaspage, tokenizedQuery, queryTokens, tokenIdx, tokenFoundCount);
            nextTokenMetasBlkno = BM25PageGetOpaque(cTokenMetaspage)->nextblkno;
            UnlockReleaseBuffer(cTokenMetasbuf);
        }
    }

    /* step 3: keep only the tokens that have a valid posting list */
    BM25QueryToken *resQueryTokens = (BM25QueryToken*)palloc0(sizeof(BM25QueryToken) * tokenFoundCount);
    uint32 tokenFillIdx = 0;
    for (size_t tokenIdx = 0; tokenIdx < tokenizedQuery.tokenCount; tokenIdx++) {
        if (!BlockNumberIsValid(queryTokens[tokenIdx].tokenPostingBlock)) {
            continue;
        }
        resQueryTokens[tokenFillIdx] = queryTokens[tokenIdx];
        tokenFillIdx++;
        if (tokenFillIdx >= tokenFoundCount) {
            break;
        }
    }
    pfree(queryTokens);
    pfree(bucketsLocation);
    if (tokenizedQuery.tokenDatas != nullptr) {
        pfree(tokenizedQuery.tokenDatas);  /* was leaked before; no longer needed */
    }
    BM25QueryTokensInfo tokensInfo{0};
    tokensInfo.queryTokens = resQueryTokens;
    tokensInfo.size = tokenFoundCount;
    return tokensInfo;
}
|
||||
|
||||
/*
 * Begin a BM25 index scan: allocate the generic scan descriptor.
 * (The corrupted diff contained a duplicate 'IndexScanDesc scan'
 * declaration, which would not compile; only the initialized one is kept.)
 */
IndexScanDesc bm25beginscan_internal(Relation index, int nkeys, int norderbys)
{
    IndexScanDesc scan = RelationGetIndexScan(index, nkeys, norderbys);
    return scan;
}
|
||||
|
||||
|
||||
@ -31,6 +31,109 @@
|
||||
#include "access/datavec/bm25.h"
|
||||
#include "access/datavec/utils.h"
|
||||
#include "storage/buf/bufmgr.h"
|
||||
#include "tokenizer.h"
|
||||
|
||||
slock_t newBufferMutex;
|
||||
|
||||
/*
 * Tokenize a document with the jieba tokenizer and convert the result into
 * palloc0'd BM25TokenData entries.
 *
 * docLength accumulates the per-token frequencies (total occurrences).
 * On a string-copy failure the partial result is discarded and an empty
 * tokenization (tokenDatas = nullptr, tokenCount = 0) is returned.
 * The malloc'd embeddingMap.tokens array from the tokenizer library is
 * always freed before returning.
 */
BM25TokenizedDocData BM25DocumentTokenize(const char* doc)
{
    uint32 docLength = 0;
    EmbeddingMap embeddingMap{0};
    ConvertString2Embedding(doc, &embeddingMap, true);
    BM25TokenizedDocData tokenizedData = {};
    BM25TokenData* tokenDatas = (BM25TokenData*)palloc0(sizeof(BM25TokenData) * embeddingMap.size);
    for (size_t idx = 0; idx < embeddingMap.size; idx++) {
        tokenDatas[idx].hashValue = embeddingMap.tokens[idx].key;
        tokenDatas[idx].tokenFreq = embeddingMap.tokens[idx].value;
        errno_t rc = strncpy_s(tokenDatas[idx].tokenValue, BM25_MAX_TOKEN_LEN, embeddingMap.tokens[idx].token,
            BM25_MAX_TOKEN_LEN - 1);
        if (rc != EOK) {
            /* surface an empty tokenization rather than a partial one */
            pfree(tokenDatas);
            tokenDatas = nullptr;
            docLength = 0;
            embeddingMap.size = 0;
            break;
        }
        tokenDatas[idx].tokenId = 0;  /* unknown at tokenize time; initialized to 0 */
        docLength += embeddingMap.tokens[idx].value;
    }
    tokenizedData.tokenDatas = tokenDatas;
    tokenizedData.tokenCount = embeddingMap.size;
    tokenizedData.docLength = docLength;
    if (embeddingMap.tokens != nullptr) {
        free(embeddingMap.tokens);  /* malloc'd by the tokenizer side */
        embeddingMap.tokens = nullptr;
    }
    return tokenizedData;
}
|
||||
|
||||
/*
|
||||
* New buffer
|
||||
*/
|
||||
Buffer BM25NewBuffer(Relation index, ForkNumber forkNum)
|
||||
{
|
||||
SpinLockAcquire(&newBufferMutex);
|
||||
Buffer buf = ReadBufferExtended(index, forkNum, P_NEW, RBM_NORMAL, NULL);
|
||||
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
|
||||
SpinLockRelease(&newBufferMutex);
|
||||
return buf;
|
||||
}
|
||||
|
||||
/*
|
||||
* Init page
|
||||
*/
|
||||
void BM25InitPage(Buffer buf, Page page)
|
||||
{
|
||||
PageInit(page, BufferGetPageSize(buf), sizeof(BM25PageOpaqueData));
|
||||
BM25PageGetOpaque(page)->nextblkno = InvalidBlockNumber;
|
||||
BM25PageGetOpaque(page)->page_id = BM25_PAGE_ID;
|
||||
}
|
||||
|
||||
/*
|
||||
* Init and register page
|
||||
*/
|
||||
void BM25InitRegisterPage(Relation index, Buffer *buf, Page *page, GenericXLogState **state)
|
||||
{
|
||||
*state = GenericXLogStart(index);
|
||||
*page = GenericXLogRegisterBuffer(*state, *buf, GENERIC_XLOG_FULL_IMAGE);
|
||||
BM25InitPage(*buf, *page);
|
||||
}
|
||||
|
||||
/*
|
||||
* Commit buffer
|
||||
*/
|
||||
void BM25CommitBuffer(Buffer buf, GenericXLogState *state)
|
||||
{
|
||||
GenericXLogFinish(state);
|
||||
UnlockReleaseBuffer(buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* Add a new page
|
||||
*
|
||||
* The order is very important!!
|
||||
*/
|
||||
void BM25AppendPage(Relation index, Buffer *buf, Page *page, ForkNumber forkNum, bool unlockOldBuf)
|
||||
{
|
||||
/* Get new buffer */
|
||||
Buffer newbuf = BM25NewBuffer(index, forkNum);
|
||||
Page newpage = BufferGetPage(newbuf);
|
||||
|
||||
/* Update the previous buffer */
|
||||
BM25PageGetOpaque(*page)->nextblkno = BufferGetBlockNumber(newbuf);
|
||||
|
||||
/* Init new page */
|
||||
BM25InitPage(newbuf, newpage);
|
||||
|
||||
MarkBufferDirty(*buf);
|
||||
if (unlockOldBuf) {
|
||||
/* Unlock */
|
||||
UnlockReleaseBuffer(*buf);
|
||||
}
|
||||
|
||||
*page = BufferGetPage(newbuf);
|
||||
*buf = newbuf;
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the metapage info
|
||||
@ -58,18 +161,17 @@ uint32 BM25AllocateDocId(Relation index)
|
||||
Page page;
|
||||
BM25MetaPage metapBuf;
|
||||
uint32 docId;
|
||||
GenericXLogState *state;
|
||||
|
||||
buf = ReadBuffer(index, BM25_METAPAGE_BLKNO);
|
||||
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
|
||||
state = GenericXLogStart(index);
|
||||
page = GenericXLogRegisterBuffer(state, buf, 0);
|
||||
page = BufferGetPage(buf);
|
||||
metapBuf = BM25PageGetMeta(page);
|
||||
if (unlikely(metapBuf->magicNumber != BM25_MAGIC_NUMBER))
|
||||
elog(ERROR, "bm25 index is not valid");
|
||||
docId = metapBuf->nextDocId;
|
||||
metapBuf->nextDocId++;
|
||||
BM25CommitBuffer(buf, state);
|
||||
MarkBufferDirty(buf);
|
||||
UnlockReleaseBuffer(buf);
|
||||
return docId;
|
||||
}
|
||||
|
||||
@ -79,18 +181,17 @@ uint32 BM25AllocateTokenId(Relation index)
|
||||
Page page;
|
||||
BM25MetaPage metapBuf;
|
||||
uint32 tokenId;
|
||||
GenericXLogState *state;
|
||||
|
||||
buf = ReadBuffer(index, BM25_METAPAGE_BLKNO);
|
||||
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
|
||||
state = GenericXLogStart(index);
|
||||
page = GenericXLogRegisterBuffer(state, buf, 0);
|
||||
page = BufferGetPage(buf);
|
||||
metapBuf = BM25PageGetMeta(page);
|
||||
if (unlikely(metapBuf->magicNumber != BM25_MAGIC_NUMBER))
|
||||
elog(ERROR, "bm25 index is not valid");
|
||||
tokenId = metapBuf->nextTokenId;
|
||||
metapBuf->nextTokenId++;
|
||||
BM25CommitBuffer(buf, state);
|
||||
MarkBufferDirty(buf);
|
||||
UnlockReleaseBuffer(buf);
|
||||
return tokenId;
|
||||
}
|
||||
|
||||
@ -99,19 +200,18 @@ void BM25IncreaseDocAndTokenCount(Relation index, uint32 tokenCount, float &avgd
|
||||
Buffer buf;
|
||||
Page page;
|
||||
BM25MetaPage metapBuf;
|
||||
GenericXLogState *state;
|
||||
|
||||
buf = ReadBuffer(index, BM25_METAPAGE_BLKNO);
|
||||
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
|
||||
state = GenericXLogStart(index);
|
||||
page = GenericXLogRegisterBuffer(state, buf, 0);
|
||||
page = BufferGetPage(buf);
|
||||
metapBuf = BM25PageGetMeta(page);
|
||||
if (unlikely(metapBuf->magicNumber != BM25_MAGIC_NUMBER))
|
||||
elog(ERROR, "bm25 index is not valid");
|
||||
metapBuf->documentCount++;
|
||||
metapBuf->tokenCount += tokenCount;
|
||||
avgdl = metapBuf->tokenCount / metapBuf->documentCount;
|
||||
BM25CommitBuffer(buf, state);
|
||||
MarkBufferDirty(buf);
|
||||
UnlockReleaseBuffer(buf);
|
||||
}
|
||||
|
||||
BlockNumber SeekBlocknoForDoc(Relation index, uint32 docId, BlockNumber startBlkno, BlockNumber step)
|
||||
@ -120,7 +220,7 @@ BlockNumber SeekBlocknoForDoc(Relation index, uint32 docId, BlockNumber startBlk
|
||||
Page page;
|
||||
BlockNumber docBlkno = startBlkno;
|
||||
for (int i = 0; i < step; ++i) {
|
||||
if (unlikely(BlockNumberIsValid(docBlkno)) {
|
||||
if (unlikely(BlockNumberIsValid(docBlkno))) {
|
||||
elog(ERROR, "SeekBlocknoForDoc: Invalid Block Number.");
|
||||
}
|
||||
buf = ReadBuffer(index, docBlkno);
|
||||
@ -130,4 +230,32 @@ BlockNumber SeekBlocknoForDoc(Relation index, uint32 docId, BlockNumber startBlk
|
||||
UnlockReleaseBuffer(buf);
|
||||
}
|
||||
return docBlkno;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Linear-scan one hash-buckets page for bucketId. On a match, fill
 * bucketLocation with this page's block number and the item offset.
 * Returns true if the bucket was found on this page.
 */
bool FindHashBucket(uint32 bucketId, BM25PageLocationInfo &bucketLocation, Buffer buf, Page page)
{
    OffsetNumber maxoffno = PageGetMaxOffsetNumber(page);
    for (OffsetNumber offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) {
        BM25HashBucketPage bucket = (BM25HashBucketPage)PageGetItem(page, PageGetItemId(page, offno));
        if (bucket->bucketId == bucketId) {
            bucketLocation.blkno = BufferGetBlockNumber(buf);
            bucketLocation.offno = offno;
            return true;
        }
    }
    return false;
}
|
||||
|
||||
/*
 * Linear-scan one token-meta page for tokenData's token text. On a match,
 * fill tokenMetaLocation with this page's block number and the item offset.
 * Returns true if the token meta was found on this page.
 */
bool FindTokenMeta(BM25TokenData &tokenData, BM25PageLocationInfo &tokenMetaLocation, Buffer buf, Page page)
{
    OffsetNumber maxoffno = PageGetMaxOffsetNumber(page);
    for (OffsetNumber offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) {
        BM25TokenMetaPage tokenMeta = (BM25TokenMetaPage)PageGetItem(page, PageGetItemId(page, offno));
        if (strncmp(tokenMeta->token, tokenData.tokenValue, BM25_MAX_TOKEN_LEN - 1) == 0) {
            tokenMetaLocation.blkno = BufferGetBlockNumber(buf);
            tokenMetaLocation.offno = offno;
            return true;
        }
    }
    return false;
}
|
||||
|
||||
@ -32,7 +32,6 @@
|
||||
#include "access/generic_xlog.h"
|
||||
#include "catalog/pg_operator.h"
|
||||
#include "port.h" /* for random() */
|
||||
#include "db4ai/bayesnet.h"
|
||||
#include "access/datavec/vecindex.h"
|
||||
|
||||
#define BM25_VERSION 1
|
||||
@ -52,6 +51,8 @@
|
||||
#define BM25_DOCUMENT_MAX_COUNT_IN_PAGE (BM25_PAGE_DATASIZE / BM25_DOCUMENT_ITEM_SIZE)
|
||||
#define BM25_DOCUMENT_FORWARD_ITEM_SIZE (MAXALIGN(sizeof(BM25DocForwardItem)))
|
||||
#define BM25_DOC_FORWARD_MAX_COUNT_IN_PAGE (BM25_PAGE_DATASIZE / BM25_DOCUMENT_FORWARD_ITEM_SIZE)
|
||||
/* Maximum stored token length; must stay in sync with the tokenizer's MAX_TOKEN_LEN. */
#define BM25_MAX_TOKEN_LEN 100
/* Number of hash buckets; token hashes are reduced modulo this value. */
#define BM25_BUCKET_MAX_NUM 1000
|
||||
|
||||
typedef struct BM25ScanData {
|
||||
uint32 docId;
|
||||
@ -59,6 +60,19 @@ typedef struct BM25ScanData {
|
||||
ItemPointerData heapCtid;
|
||||
} BM25ScanData;
|
||||
|
||||
typedef struct BM25TokenData {
|
||||
uint32 hashValue;
|
||||
uint32 tokenId;
|
||||
uint32 tokenFreq;
|
||||
char tokenValue[BM25_MAX_TOKEN_LEN];
|
||||
} BM25TokenData;
|
||||
|
||||
typedef struct BM25TokenizedDocData {
|
||||
BM25TokenData* tokenDatas;
|
||||
uint32 tokenCount;
|
||||
uint32 docLength;
|
||||
} BM25TokenizedDocData;
|
||||
|
||||
typedef struct BM25ScanOpaqueData {
|
||||
uint32 cursor;
|
||||
BM25ScanData* candDocs;
|
||||
@ -74,6 +88,8 @@ typedef BM25ScanOpaqueData *BM25ScanOpaque;
|
||||
typedef struct BM25EntryPages {
|
||||
BlockNumber documentMetaPage;
|
||||
BlockNumber docForwardPage;
|
||||
BlockNumber hashBucketsPage;
|
||||
uint32 hashBucketCount;
|
||||
} BM25EntryPages;
|
||||
|
||||
typedef struct BM25MetaPageData {
|
||||
@ -129,11 +145,72 @@ typedef struct BM25DocumentItem {
|
||||
uint64 tokenEndIdx;
|
||||
} BM25DocumentItem;
|
||||
|
||||
typedef struct BM25HashBucketItem {
|
||||
uint32 bucketId;
|
||||
BlockNumber bucketBlkno;
|
||||
} BM25HashBucketItem;
|
||||
|
||||
typedef BM25HashBucketItem *BM25HashBucketPage;
|
||||
|
||||
typedef struct BM25TokenMetaItem {
|
||||
uint32 tokenId;
|
||||
uint32 docCount;
|
||||
BlockNumber postingBlkno;
|
||||
float maxScore;
|
||||
char token[BM25_MAX_TOKEN_LEN];
|
||||
} BM25TokenMetaItem;
|
||||
|
||||
typedef BM25TokenMetaItem *BM25TokenMetaPage;
|
||||
|
||||
typedef struct BM25TokenPostingItem {
|
||||
uint32 docId;
|
||||
uint16 docLength;
|
||||
uint16 freq;
|
||||
} BM25TokenPostingItem;
|
||||
|
||||
typedef BM25TokenPostingItem *BM25TokenPostingPage;
|
||||
|
||||
typedef struct BM25PageLocationInfo {
|
||||
BlockNumber blkno;
|
||||
OffsetNumber offno;
|
||||
} BM25PageLocationInfo;
|
||||
|
||||
typedef struct BM25Shared {
|
||||
/* Immutable state */
|
||||
Oid heaprelid;
|
||||
Oid indexrelid;
|
||||
|
||||
/* Mutex for mutable state */
|
||||
slock_t mutex;
|
||||
|
||||
/* Mutable state */
|
||||
int nparticipantsdone;
|
||||
double reltuples;
|
||||
|
||||
BM25EntryPages bm25EntryPages;
|
||||
|
||||
ParallelHeapScanDescData heapdesc;
|
||||
} BM25Shared;
|
||||
|
||||
typedef struct BM25Leader {
|
||||
int nparticipanttuplesorts;
|
||||
BM25Shared *bm25shared;
|
||||
} BM25Leader;
|
||||
|
||||
typedef struct BM25ReorderShared {
|
||||
BM25PageLocationInfo *startPageLocation;
|
||||
uint32 batchCount;
|
||||
uint32 curThreadId;
|
||||
Oid heaprelid;
|
||||
Oid indexrelid;
|
||||
} BM25ReorderShared;
|
||||
|
||||
typedef struct BM25BuildState {
|
||||
/* Info */
|
||||
Relation heap;
|
||||
Relation index;
|
||||
IndexInfo *indexInfo;
|
||||
ForkNumber forkNum;
|
||||
|
||||
BM25EntryPages bm25EntryPages;
|
||||
|
||||
@ -147,13 +224,40 @@ typedef struct BM25BuildState {
|
||||
|
||||
/* Memory */
|
||||
MemoryContext tmpCtx;
|
||||
|
||||
BM25Leader *bm25leader;
|
||||
} BM25BuildState;
|
||||
|
||||
|
||||
/*
 * Okapi BM25 term scorer:
 *   score(tf, dl) = tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / avgdl))
 */
struct BM25Scorer : public BaseObject {
public:
    float m_k1;     /* term-frequency saturation parameter */
    float m_b;      /* document-length normalization parameter */
    float m_avgdl;  /* average document length in the index */

    /* Score one term occurrence count against a document of length docLen. */
    float GetDocBM25Score(float tf, float docLen) const
    {
        return tf * (m_k1 + 1) / (tf + m_k1 * (1 - m_b + m_b * (docLen / m_avgdl)));
    }

    BM25Scorer(float k1, float b, float avgdl)
        : m_k1(k1), m_b(b), m_avgdl(avgdl) {}
}; // struct BM25Scorer
|
||||
|
||||
/* Methods */
|
||||
Buffer BM25NewBuffer(Relation index, ForkNumber forkNum);
|
||||
void BM25InitPage(Buffer buf, Page page);
|
||||
void BM25InitRegisterPage(Relation index, Buffer *buf, Page *page, GenericXLogState **state);
|
||||
void BM25CommitBuffer(Buffer buf, GenericXLogState *state);
|
||||
void BM25AppendPage(Relation index, Buffer *buf, Page *page, ForkNumber forkNum, bool unlockOldBuf = true);
|
||||
void BM25GetMetaPageInfo(Relation index, BM25MetaPage metap);
|
||||
uint32 BM25AllocateDocId(Relation index);
|
||||
uint32 BM25AllocateTokenId(Relation index);
|
||||
void BM25IncreaseDocAndTokenCount(Relation index, uint32 tokenCount, float &avgdl);
|
||||
BlockNumber SeekBlocknoForDoc(Relation index, uint32 docId, BlockNumber startBlkno, BlockNumber step);
|
||||
bool FindHashBucket(uint32 bucketId, BM25PageLocationInfo &bucketLocation, Buffer buf, Page page);
|
||||
bool FindTokenMeta(BM25TokenData &tokenData, BM25PageLocationInfo &tokenMetaLocation, Buffer buf, Page page);
|
||||
BM25TokenizedDocData BM25DocumentTokenize(const char* doc);
|
||||
|
||||
Datum bm25build(PG_FUNCTION_ARGS);
|
||||
Datum bm25buildempty(PG_FUNCTION_ARGS);
|
||||
|
||||
Reference in New Issue
Block a user