bm25: build index(inverted_list, parallel_building), tokenizer for (docs & search).

(cherry picked from commit ea882f08feabb5d386cde15a828581bc3665429b)
This commit is contained in:
wlff234
2025-05-04 20:30:52 +08:00
parent c7bfa30b13
commit dedb5cbc77
7 changed files with 1161 additions and 104 deletions

View File

@ -64,6 +64,7 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/chparser)
endif()
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/age)
add_subdirectory(age)
endif()
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/jieba_tokenizer)
add_subdirectory(jieba_tokenizer)
endif()

View File

@ -30,7 +30,7 @@
#include "tokenizer.h"
const size_t MAX_LENGTH_CRC = 100;
const size_t MAX_KEYWORD_NUM = 100;
const size_t MAX_KEYWORD_NUM = 100000;
const size_t MAX_PATH_LEN = 1024;
const char* const DICT_PATH = "lib/jieba_dict/jieba.dict.utf8";
@ -66,22 +66,31 @@ inline static uint32_t HashString2Uint32(const std::string& srcStr)
return crc;
}
inline static void ConvertEmbeddingMap(std::unordered_map<uint32_t, float> tempMap, EmbeddingMap *embeddingMap)
inline static void ConvertEmbeddingMap(std::unordered_map<std::string, std::pair<uint32_t, float>> tokensMap,
EmbeddingMap *embeddingMap)
{
embeddingMap->size = tempMap.size();
embeddingMap->size = tokensMap.size();
if (embeddingMap->size == 0) {
return;
}
embeddingMap->pairs = (EmbeddingPair *)malloc(embeddingMap->size * sizeof(EmbeddingPair));
if (embeddingMap->pairs == nullptr) {
embeddingMap->tokens = (EmbeddingTokenInfo *)malloc(embeddingMap->size * sizeof(EmbeddingTokenInfo));
if (embeddingMap->tokens == nullptr) {
embeddingMap->size = 0;
return;
}
size_t index = 0;
for (const auto& pair : tempMap) {
embeddingMap->pairs[index].key = pair.first;
embeddingMap->pairs[index].value = pair.second;
index++;
size_t idx = 0;
for (const auto& token : tokensMap) {
embeddingMap->tokens[idx].key = token.second.first;
embeddingMap->tokens[idx].value = token.second.second;
errno_t rc = strncpy_s(embeddingMap->tokens[idx].token, MAX_TOKEN_LEN, token.first.c_str(), MAX_TOKEN_LEN - 1);
if (rc != EOK) {
free(embeddingMap->tokens);
embeddingMap->tokens = nullptr;
embeddingMap->size = 0;
return;
}
idx++;
}
}
@ -148,31 +157,36 @@ bool ConvertString2Embedding(const char* srcStr, EmbeddingMap *embeddingMap, boo
if (jiebaTokenizer == nullptr || srcStr == nullptr || embeddingMap == nullptr) {
return false;
}
std::string srcStrInput(srcStr);
std::unordered_map<uint32_t, float> tempMap;
std::string sentence(srcStr);
std::unordered_map<std::string, std::pair<uint32_t, float>> tokensMap;
if (isKeywordExtractor) {
std::vector<cppjieba::KeywordExtractor::Word> keywords;
jiebaTokenizer->extractor.Extract(srcStrInput, keywords, MAX_KEYWORD_NUM);
jiebaTokenizer->extractor.Extract(sentence, keywords, MAX_KEYWORD_NUM);
for (const auto& keyword : keywords) {
uint32_t hashValue = HashString2Uint32(Convert2LowerCase(keyword.word));
tempMap[hashValue] += keyword.weight;
tokensMap[keyword.word] = std::make_pair(hashValue, keyword.weight);
}
if (!tempMap.empty()) {
ConvertEmbeddingMap(tempMap, embeddingMap);
if (!tokensMap.empty()) {
ConvertEmbeddingMap(tokensMap, embeddingMap);
return true;
}
}
// if the keywords extracted by 'Extract' are empty, then use 'Cut' for tokenization.
std::vector<std::string> tokens;
jiebaTokenizer->Cut(srcStrInput, tokens, true);
jiebaTokenizer->Cut(sentence, tokens, true);
for (const auto& token : tokens) {
if (IsWhitespace(token)) {
continue;
}
uint32_t hashValue = HashString2Uint32(Convert2LowerCase(token));
tempMap[hashValue] += 1.0f;
if (tokensMap.find(token) == tokensMap.end()) {
tokensMap[token] = std::make_pair(hashValue, 1.0f);
} else {
tokensMap[token].second += 1.0f;
}
}
ConvertEmbeddingMap(tempMap, embeddingMap);
ConvertEmbeddingMap(tokensMap, embeddingMap);
return true;
}

View File

@ -27,17 +27,20 @@
#include <stdint.h>
#define MAX_TOKEN_LEN 100
#ifdef __cplusplus
extern "C" {
#endif
typedef struct EmbeddingPair {
typedef struct EmbeddingTokenInfo {
uint32_t key;
float value;
} EmbeddingPair;
char token[MAX_TOKEN_LEN];
} EmbeddingTokenInfo;
typedef struct {
EmbeddingPair *pairs;
EmbeddingTokenInfo *tokens;
size_t size;
} EmbeddingMap;

File diff suppressed because it is too large Load Diff

View File

@ -32,11 +32,133 @@
#include "access/heapam.h"
#include "catalog/index.h"
#include "access/tableam.h"
#include "db4ai/bayesnet.h"
#include "access/datavec/bm25.h"
/* Per-token query-side state resolved from the index during a scan. */
typedef struct BM25QueryToken {
    BlockNumber tokenPostingBlock;  /* first block of the token's posting chain; InvalidBlockNumber until resolved */
    float qTokenMaxScore;           /* maxScore copied from the token's meta entry */
    float qTokenIDFVal;             /* query-side token frequency multiplied by the BM25 IDF term */
} BM25QueryToken;

/* Array of resolved query tokens plus its element count. */
typedef struct BM25QueryTokensInfo {
    BM25QueryToken *queryTokens;    /* palloc'd; nullptr when the query produced no tokens */
    uint32 size;
} BM25QueryTokensInfo;
/*
 * Scan one hash-buckets page and, for every query token whose bucket has not
 * been located yet, record the bucket's block number in bucketsLocation.
 * bucketFoundCount is advanced for each newly located bucket; the scan stops
 * early once every token's bucket is known.
 */
static void FindBucketsLocation(Page page, BM25TokenizedDocData &tokenizedQuery, BlockNumber *bucketsLocation, uint32 &bucketFoundCount)
{
    OffsetNumber maxoffno = PageGetMaxOffsetNumber(page);
    for (OffsetNumber offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) {
        BM25HashBucketPage bucketItem = (BM25HashBucketPage)PageGetItem(page, PageGetItemId(page, offno));
        for (size_t idx = 0; idx < tokenizedQuery.tokenCount; idx++) {
            if (bucketsLocation[idx] != InvalidBlockNumber) {
                continue;  /* this token's bucket is already known */
            }
            if (bucketItem->bucketId != (tokenizedQuery.tokenDatas[idx].hashValue % BM25_BUCKET_MAX_NUM)) {
                continue;
            }
            bucketsLocation[idx] = bucketItem->bucketBlkno;
            bucketFoundCount++;
            if (bucketFoundCount >= tokenizedQuery.tokenCount) {
                return;  /* all buckets located */
            }
        }
    }
}
/*
 * Scan one token-meta page for the query token at tokenIdx.  On a match,
 * fill in the token's posting-list head, recorded max score and IDF value in
 * queryTokens[tokenIdx] and bump tokenFoundCount.
 *
 * Fix: return as soon as this token's entry is found.  A query token matches
 * at most one meta entry; the original kept scanning, which wasted work and
 * would double-increment tokenFoundCount if a duplicate entry ever appeared
 * on the page, desynchronizing the caller's result-array sizing.
 */
static void FindTokenInfo(BM25MetaPageData &meta, Page page, BM25TokenizedDocData &tokenizedQuery,
    BM25QueryToken *queryTokens, size_t tokenIdx, uint32 &tokenFoundCount)
{
    /* Already resolved on an earlier page of this chain: nothing to do. */
    if (BlockNumberIsValid(queryTokens[tokenIdx].tokenPostingBlock)) {
        return;
    }
    OffsetNumber maxoffno = PageGetMaxOffsetNumber(page);
    for (OffsetNumber offnoTokenMeta = FirstOffsetNumber; offnoTokenMeta <= maxoffno; offnoTokenMeta++) {
        BM25TokenMetaPage tokenMeta = (BM25TokenMetaPage)PageGetItem(page, PageGetItemId(page, offnoTokenMeta));
        if (strncmp(tokenMeta->token, tokenizedQuery.tokenDatas[tokenIdx].tokenValue, BM25_MAX_TOKEN_LEN - 1) != 0) {
            continue;
        }
        queryTokens[tokenIdx].qTokenMaxScore = tokenMeta->maxScore;
        queryTokens[tokenIdx].tokenPostingBlock = tokenMeta->postingBlkno;
        /* BM25 IDF: log(1 + (N - df + 0.5) / (df + 0.5)), weighted by the query tf. */
        queryTokens[tokenIdx].qTokenIDFVal = tokenizedQuery.tokenDatas[tokenIdx].tokenFreq *
            std::log((1 + ((float)meta.documentCount - (float)tokenMeta->docCount + 0.5) /
            ((float)tokenMeta->docCount + 0.5)));
        tokenFoundCount++;
        return;  /* each token matches at most one entry */
    }
}
/*
 * Tokenize the query sentence and resolve each token against the index:
 * locate each token's hash bucket by walking the hash-buckets page chain,
 * then walk that bucket's token-meta chain for the posting-list head, max
 * score and IDF.  Tokens absent from the index are dropped from the result.
 *
 * Returns a palloc'd array sized exactly to the number of resolved tokens
 * (size 0 / nullptr when the sentence yields no tokens).
 *
 * Fixes vs. original: removed two unused `OffsetNumber maxoffno` locals,
 * stop walking a token's meta chain once that token is resolved, and free
 * the tokenizer scratch array instead of leaking it until context reset.
 */
static BM25QueryTokensInfo GetQueryTokens(Relation index, const char* sentence)
{
    BM25QueryTokensInfo tokensInfo{0};
    tokensInfo.queryTokens = nullptr;
    tokensInfo.size = 0;

    BM25TokenizedDocData tokenizedQuery = BM25DocumentTokenize(sentence);
    if (tokenizedQuery.tokenCount == 0) {
        return tokensInfo;
    }
    BM25QueryToken *queryTokens = (BM25QueryToken*)palloc0(sizeof(BM25QueryToken) * tokenizedQuery.tokenCount);
    BlockNumber *bucketsLocation = (BlockNumber*)palloc0(sizeof(BlockNumber) * tokenizedQuery.tokenCount);
    for (size_t tokenIdx = 0; tokenIdx < tokenizedQuery.tokenCount; tokenIdx++) {
        queryTokens[tokenIdx].tokenPostingBlock = InvalidBlockNumber;
        bucketsLocation[tokenIdx] = InvalidBlockNumber;
    }

    /* Walk the hash-buckets page chain to locate each token's bucket. */
    uint32 bucketFoundCount = 0;
    BM25MetaPageData meta;
    BM25GetMetaPageInfo(index, &meta);
    BlockNumber nextHashBucketsBlkno = meta.entryPageList.hashBucketsPage;
    while (bucketFoundCount < tokenizedQuery.tokenCount && BlockNumberIsValid(nextHashBucketsBlkno)) {
        Buffer hashBucketsBuf = ReadBuffer(index, nextHashBucketsBlkno);
        LockBuffer(hashBucketsBuf, BUFFER_LOCK_SHARE);
        Page hashBucketsPage = BufferGetPage(hashBucketsBuf);
        FindBucketsLocation(hashBucketsPage, tokenizedQuery, bucketsLocation, bucketFoundCount);
        nextHashBucketsBlkno = BM25PageGetOpaque(hashBucketsPage)->nextblkno;
        UnlockReleaseBuffer(hashBucketsBuf);
    }

    /* Walk each located bucket's token-meta chain to resolve the token. */
    uint32 tokenFoundCount = 0;
    for (size_t tokenIdx = 0; tokenIdx < tokenizedQuery.tokenCount; tokenIdx++) {
        if (!BlockNumberIsValid(bucketsLocation[tokenIdx])) {
            continue;  /* bucket not present: token cannot be in the index */
        }
        BlockNumber nextTokenMetasBlkno = bucketsLocation[tokenIdx];
        /* Stop this chain walk as soon as the token's entry is resolved. */
        while (!BlockNumberIsValid(queryTokens[tokenIdx].tokenPostingBlock) &&
            BlockNumberIsValid(nextTokenMetasBlkno)) {
            Buffer tokenMetasBuf = ReadBuffer(index, nextTokenMetasBlkno);
            LockBuffer(tokenMetasBuf, BUFFER_LOCK_SHARE);
            Page tokenMetasPage = BufferGetPage(tokenMetasBuf);
            FindTokenInfo(meta, tokenMetasPage, tokenizedQuery, queryTokens, tokenIdx, tokenFoundCount);
            nextTokenMetasBlkno = BM25PageGetOpaque(tokenMetasPage)->nextblkno;
            UnlockReleaseBuffer(tokenMetasBuf);
        }
    }

    /* Compact the resolved tokens into a tightly sized result array. */
    BM25QueryToken *resQueryTokens = (BM25QueryToken*)palloc0(sizeof(BM25QueryToken) * tokenFoundCount);
    uint32 tokenFillIdx = 0;
    for (size_t tokenIdx = 0; tokenIdx < tokenizedQuery.tokenCount && tokenFillIdx < tokenFoundCount; tokenIdx++) {
        if (BlockNumberIsValid(queryTokens[tokenIdx].tokenPostingBlock)) {
            resQueryTokens[tokenFillIdx++] = queryTokens[tokenIdx];
        }
    }
    pfree(queryTokens);
    pfree(bucketsLocation);
    if (tokenizedQuery.tokenDatas != nullptr) {
        pfree(tokenizedQuery.tokenDatas);  /* tokenizer scratch no longer needed */
    }
    tokensInfo.queryTokens = resQueryTokens;
    tokensInfo.size = tokenFoundCount;
    return tokensInfo;
}
/*
 * Begin a scan on a BM25 index: allocate and return the generic index scan
 * descriptor sized for nkeys scan keys and norderbys ordering operators.
 *
 * Fix: the block contained both an uninitialized `IndexScanDesc scan;`
 * declaration and an initializing one — a redefinition of the same name in
 * one scope.  Keep only the initialized declaration.
 */
IndexScanDesc bm25beginscan_internal(Relation index, int nkeys, int norderbys)
{
    IndexScanDesc scan = RelationGetIndexScan(index, nkeys, norderbys);
    return scan;
}

View File

@ -31,6 +31,109 @@
#include "access/datavec/bm25.h"
#include "access/datavec/utils.h"
#include "storage/buf/bufmgr.h"
#include "tokenizer.h"
slock_t newBufferMutex;
/*
 * Tokenize a document (or query) string through the jieba-based tokenizer
 * shim and convert its embedding map into a palloc'd BM25TokenData array.
 * On a token-copy failure the partial result is discarded and an empty
 * result (tokenDatas == nullptr, counts 0) is returned.
 */
BM25TokenizedDocData BM25DocumentTokenize(const char* doc)
{
    uint32 totalLength = 0;
    EmbeddingMap embeddingMap{0};
    ConvertString2Embedding(doc, &embeddingMap, true);

    BM25TokenizedDocData tokenized = {};
    BM25TokenData* entries = (BM25TokenData*)palloc0(sizeof(BM25TokenData) * embeddingMap.size);
    for (size_t i = 0; i < embeddingMap.size; i++) {
        entries[i].hashValue = embeddingMap.tokens[i].key;
        entries[i].tokenFreq = embeddingMap.tokens[i].value;
        errno_t rc = strncpy_s(entries[i].tokenValue, BM25_MAX_TOKEN_LEN, embeddingMap.tokens[i].token,
            BM25_MAX_TOKEN_LEN - 1);
        if (rc != EOK) {
            /* Copy failed: drop everything built so far and report empty. */
            pfree(entries);
            entries = nullptr;
            totalLength = 0;
            embeddingMap.size = 0;
            break;
        }
        entries[i].tokenId = 0;  /* not yet assigned by the index */
        totalLength += embeddingMap.tokens[i].value;
    }
    tokenized.tokenDatas = entries;
    tokenized.tokenCount = embeddingMap.size;
    tokenized.docLength = totalLength;
    if (embeddingMap.tokens != nullptr) {
        free(embeddingMap.tokens);  /* malloc'd inside the C tokenizer shim */
        embeddingMap.tokens = nullptr;
    }
    return tokenized;
}
/*
* New buffer
*/
/*
 * Allocate a new page at the end of the index relation (P_NEW) and return
 * its buffer exclusively locked.  newBufferMutex serializes concurrent
 * relation extension.
 *
 * NOTE(review): the spinlock is held across ReadBufferExtended, which may
 * perform I/O — PostgreSQL spinlocks are intended for a few instructions
 * only.  Also, no SpinLockInit(&newBufferMutex) is visible in this file;
 * confirm it is initialized elsewhere, and consider an LWLock here.
 */
Buffer BM25NewBuffer(Relation index, ForkNumber forkNum)
{
    SpinLockAcquire(&newBufferMutex);
    Buffer buf = ReadBufferExtended(index, forkNum, P_NEW, RBM_NORMAL, NULL);
    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
    SpinLockRelease(&newBufferMutex);
    return buf;
}
/*
* Init page
*/
/*
 * Initialize a page as a BM25 page: standard page header plus the BM25
 * opaque area (no next block yet, BM25 page id stamped for identification).
 */
void BM25InitPage(Buffer buf, Page page)
{
    PageInit(page, BufferGetPageSize(buf), sizeof(BM25PageOpaqueData));
    auto opaque = BM25PageGetOpaque(page);
    opaque->nextblkno = InvalidBlockNumber;
    opaque->page_id = BM25_PAGE_ID;
}
/*
* Init and register page
*/
/*
 * Start a generic-XLOG change for the index, register *buf with a full-page
 * image, and initialize the registered page as a BM25 page.  The caller
 * finishes the change with BM25CommitBuffer(*buf, *state).
 */
void BM25InitRegisterPage(Relation index, Buffer *buf, Page *page, GenericXLogState **state)
{
    *state = GenericXLogStart(index);
    *page = GenericXLogRegisterBuffer(*state, *buf, GENERIC_XLOG_FULL_IMAGE);
    BM25InitPage(*buf, *page);
}
/*
* Commit buffer
*/
/*
 * Finish the pending generic-XLOG change (WAL-log and apply it), then
 * release the buffer's lock and pin.
 */
void BM25CommitBuffer(Buffer buf, GenericXLogState *state)
{
    GenericXLogFinish(state);
    UnlockReleaseBuffer(buf);
}
/*
* Add a new page
*
* The order is very important!!
*/
/*
 * Append a freshly allocated page to the chain ending at *buf / *page.
 *
 * The order is very important!!  The new buffer is allocated and locked
 * first, then the old page's nextblkno is linked to it while both buffers
 * are held, and only then is the old buffer released.  On return *buf and
 * *page point at the new, exclusively locked page.
 *
 * NOTE(review): only the old buffer is marked dirty here; the new page is
 * initialized but not marked dirty or WAL-logged in this function — confirm
 * callers register/dirty it before the buffer can be evicted.
 */
void BM25AppendPage(Relation index, Buffer *buf, Page *page, ForkNumber forkNum, bool unlockOldBuf)
{
    /* Get new buffer */
    Buffer newbuf = BM25NewBuffer(index, forkNum);
    Page newpage = BufferGetPage(newbuf);
    /* Update the previous buffer */
    BM25PageGetOpaque(*page)->nextblkno = BufferGetBlockNumber(newbuf);
    /* Init new page */
    BM25InitPage(newbuf, newpage);
    MarkBufferDirty(*buf);
    if (unlockOldBuf) {
        /* Unlock */
        UnlockReleaseBuffer(*buf);
    }
    *page = BufferGetPage(newbuf);
    *buf = newbuf;
}
/*
* Get the metapage info
@ -58,18 +161,17 @@ uint32 BM25AllocateDocId(Relation index)
Page page;
BM25MetaPage metapBuf;
uint32 docId;
GenericXLogState *state;
buf = ReadBuffer(index, BM25_METAPAGE_BLKNO);
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
state = GenericXLogStart(index);
page = GenericXLogRegisterBuffer(state, buf, 0);
page = BufferGetPage(buf);
metapBuf = BM25PageGetMeta(page);
if (unlikely(metapBuf->magicNumber != BM25_MAGIC_NUMBER))
elog(ERROR, "bm25 index is not valid");
docId = metapBuf->nextDocId;
metapBuf->nextDocId++;
BM25CommitBuffer(buf, state);
MarkBufferDirty(buf);
UnlockReleaseBuffer(buf);
return docId;
}
@ -79,18 +181,17 @@ uint32 BM25AllocateTokenId(Relation index)
Page page;
BM25MetaPage metapBuf;
uint32 tokenId;
GenericXLogState *state;
buf = ReadBuffer(index, BM25_METAPAGE_BLKNO);
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
state = GenericXLogStart(index);
page = GenericXLogRegisterBuffer(state, buf, 0);
page = BufferGetPage(buf);
metapBuf = BM25PageGetMeta(page);
if (unlikely(metapBuf->magicNumber != BM25_MAGIC_NUMBER))
elog(ERROR, "bm25 index is not valid");
tokenId = metapBuf->nextTokenId;
metapBuf->nextTokenId++;
BM25CommitBuffer(buf, state);
MarkBufferDirty(buf);
UnlockReleaseBuffer(buf);
return tokenId;
}
@ -99,19 +200,18 @@ void BM25IncreaseDocAndTokenCount(Relation index, uint32 tokenCount, float &avgd
Buffer buf;
Page page;
BM25MetaPage metapBuf;
GenericXLogState *state;
buf = ReadBuffer(index, BM25_METAPAGE_BLKNO);
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
state = GenericXLogStart(index);
page = GenericXLogRegisterBuffer(state, buf, 0);
page = BufferGetPage(buf);
metapBuf = BM25PageGetMeta(page);
if (unlikely(metapBuf->magicNumber != BM25_MAGIC_NUMBER))
elog(ERROR, "bm25 index is not valid");
metapBuf->documentCount++;
metapBuf->tokenCount += tokenCount;
avgdl = metapBuf->tokenCount / metapBuf->documentCount;
BM25CommitBuffer(buf, state);
MarkBufferDirty(buf);
UnlockReleaseBuffer(buf);
}
BlockNumber SeekBlocknoForDoc(Relation index, uint32 docId, BlockNumber startBlkno, BlockNumber step)
@ -120,7 +220,7 @@ BlockNumber SeekBlocknoForDoc(Relation index, uint32 docId, BlockNumber startBlk
Page page;
BlockNumber docBlkno = startBlkno;
for (int i = 0; i < step; ++i) {
if (unlikely(BlockNumberIsValid(docBlkno)) {
if (unlikely(BlockNumberIsValid(docBlkno))) {
elog(ERROR, "SeekBlocknoForDoc: Invalid Block Number.");
}
buf = ReadBuffer(index, docBlkno);
@ -130,4 +230,32 @@ BlockNumber SeekBlocknoForDoc(Relation index, uint32 docId, BlockNumber startBlk
UnlockReleaseBuffer(buf);
}
return docBlkno;
}
}
/*
 * Search one hash-buckets page for bucketId.  On success, record the page's
 * block number and the matching item's offset in bucketLocation and return
 * true; return false when the bucket is not on this page.
 */
bool FindHashBucket(uint32 bucketId, BM25PageLocationInfo &bucketLocation, Buffer buf, Page page)
{
    OffsetNumber maxoffno = PageGetMaxOffsetNumber(page);
    OffsetNumber offno = FirstOffsetNumber;
    while (offno <= maxoffno) {
        BM25HashBucketPage item = (BM25HashBucketPage)PageGetItem(page, PageGetItemId(page, offno));
        if (item->bucketId == bucketId) {
            bucketLocation.blkno = BufferGetBlockNumber(buf);
            bucketLocation.offno = offno;
            return true;
        }
        offno = OffsetNumberNext(offno);
    }
    return false;
}
/*
 * Search one token-meta page for an entry whose token text matches
 * tokenData.tokenValue (first BM25_MAX_TOKEN_LEN - 1 bytes).  On success,
 * record the page's block number and the item's offset in tokenMetaLocation
 * and return true; otherwise return false.
 */
bool FindTokenMeta(BM25TokenData &tokenData, BM25PageLocationInfo &tokenMetaLocation, Buffer buf, Page page)
{
    OffsetNumber maxoffno = PageGetMaxOffsetNumber(page);
    OffsetNumber offno = FirstOffsetNumber;
    while (offno <= maxoffno) {
        BM25TokenMetaPage entry = (BM25TokenMetaPage)PageGetItem(page, PageGetItemId(page, offno));
        if (strncmp(entry->token, tokenData.tokenValue, BM25_MAX_TOKEN_LEN - 1) == 0) {
            tokenMetaLocation.blkno = BufferGetBlockNumber(buf);
            tokenMetaLocation.offno = offno;
            return true;
        }
        offno = OffsetNumberNext(offno);
    }
    return false;
}

View File

@ -32,7 +32,6 @@
#include "access/generic_xlog.h"
#include "catalog/pg_operator.h"
#include "port.h" /* for random() */
#include "db4ai/bayesnet.h"
#include "access/datavec/vecindex.h"
#define BM25_VERSION 1
@ -52,6 +51,8 @@
#define BM25_DOCUMENT_MAX_COUNT_IN_PAGE (BM25_PAGE_DATASIZE / BM25_DOCUMENT_ITEM_SIZE)
#define BM25_DOCUMENT_FORWARD_ITEM_SIZE (MAXALIGN(sizeof(BM25DocForwardItem)))
#define BM25_DOC_FORWARD_MAX_COUNT_IN_PAGE (BM25_PAGE_DATASIZE / BM25_DOCUMENT_FORWARD_ITEM_SIZE)
#define BM25_MAX_TOKEN_LEN 100
#define BM25_BUCKET_MAX_NUM 1000
typedef struct BM25ScanData {
uint32 docId;
@ -59,6 +60,19 @@ typedef struct BM25ScanData {
ItemPointerData heapCtid;
} BM25ScanData;
/* One token produced by tokenizing a document or query sentence. */
typedef struct BM25TokenData {
    uint32 hashValue;                    /* hash of the lower-cased token text */
    uint32 tokenId;                      /* index-assigned id; 0 until allocated */
    uint32 tokenFreq;                    /* occurrence count (or extractor weight) */
    char tokenValue[BM25_MAX_TOKEN_LEN]; /* NUL-terminated token text, truncated to fit */
} BM25TokenData;

/* Result of BM25DocumentTokenize: token array plus aggregate document length. */
typedef struct BM25TokenizedDocData {
    BM25TokenData* tokenDatas;  /* palloc'd array of tokenCount entries */
    uint32 tokenCount;
    uint32 docLength;           /* sum of token frequencies */
} BM25TokenizedDocData;
typedef struct BM25ScanOpaqueData {
uint32 cursor;
BM25ScanData* candDocs;
@ -74,6 +88,8 @@ typedef BM25ScanOpaqueData *BM25ScanOpaque;
typedef struct BM25EntryPages {
BlockNumber documentMetaPage;
BlockNumber docForwardPage;
BlockNumber hashBucketsPage;
uint32 hashBucketCount;
} BM25EntryPages;
typedef struct BM25MetaPageData {
@ -129,11 +145,72 @@ typedef struct BM25DocumentItem {
uint64 tokenEndIdx;
} BM25DocumentItem;
/* Maps a hash-bucket id to the first block of its token-meta chain. */
typedef struct BM25HashBucketItem {
    uint32 bucketId;           /* token hash modulo BM25_BUCKET_MAX_NUM */
    BlockNumber bucketBlkno;   /* head of this bucket's token-meta pages */
} BM25HashBucketItem;
typedef BM25HashBucketItem *BM25HashBucketPage;

/* Per-token metadata entry stored on a token-meta page. */
typedef struct BM25TokenMetaItem {
    uint32 tokenId;
    uint32 docCount;            /* number of documents containing the token */
    BlockNumber postingBlkno;   /* first block of the token's posting list */
    float maxScore;             /* highest recorded per-document score */
    char token[BM25_MAX_TOKEN_LEN];
} BM25TokenMetaItem;
typedef BM25TokenMetaItem *BM25TokenMetaPage;

/* One posting-list entry: a document plus the token's stats within it. */
typedef struct BM25TokenPostingItem {
    uint32 docId;
    uint16 docLength;
    uint16 freq;
} BM25TokenPostingItem;
typedef BM25TokenPostingItem *BM25TokenPostingPage;

/* Address of an item inside the index: block number + item offset. */
typedef struct BM25PageLocationInfo {
    BlockNumber blkno;
    OffsetNumber offno;
} BM25PageLocationInfo;
/* State shared between participants of a parallel BM25 index build. */
typedef struct BM25Shared {
    /* Immutable state */
    Oid heaprelid;
    Oid indexrelid;
    /* Mutex for mutable state */
    slock_t mutex;
    /* Mutable state */
    int nparticipantsdone;        /* participants that have finished so far */
    double reltuples;             /* accumulated heap tuple count */
    BM25EntryPages bm25EntryPages;
    ParallelHeapScanDescData heapdesc;
} BM25Shared;

/* Leader-side bookkeeping for a parallel index build. */
typedef struct BM25Leader {
    int nparticipanttuplesorts;
    BM25Shared *bm25shared;       /* shared-memory state, see above */
} BM25Leader;

/* Work description handed to a parallel reorder worker. */
typedef struct BM25ReorderShared {
    BM25PageLocationInfo *startPageLocation;  /* per-batch starting locations */
    uint32 batchCount;
    uint32 curThreadId;
    Oid heaprelid;
    Oid indexrelid;
} BM25ReorderShared;
typedef struct BM25BuildState {
/* Info */
Relation heap;
Relation index;
IndexInfo *indexInfo;
ForkNumber forkNum;
BM25EntryPages bm25EntryPages;
@ -147,13 +224,40 @@ typedef struct BM25BuildState {
/* Memory */
MemoryContext tmpCtx;
BM25Leader *bm25leader;
} BM25BuildState;
/*
 * BM25 per-document term scorer.
 *
 * Computes the classic BM25 term-frequency component
 *     tf * (k1 + 1) / (tf + k1 * (1 - b + b * docLen / avgdl))
 * where k1 controls term-frequency saturation, b controls document-length
 * normalization, and avgdl is the average document length over the index.
 *
 * Fix: GetDocBM25Score is now const-qualified — it reads but never mutates
 * scorer state, so it can be called through const references.
 */
struct BM25Scorer : public BaseObject {
public:
    float m_k1;     /* term-frequency saturation parameter */
    float m_b;      /* length-normalization strength in [0, 1] */
    float m_avgdl;  /* average document length over the index */

    float GetDocBM25Score(float tf, float docLen) const
    {
        return tf * (m_k1 + 1) / (tf + m_k1 * (1 - m_b + m_b * (docLen / m_avgdl)));
    }

    BM25Scorer(float k1, float b, float avgdl)
        : m_k1(k1), m_b(b), m_avgdl(avgdl) {
    }
}; // struct BM25Scorer
/* Methods */
Buffer BM25NewBuffer(Relation index, ForkNumber forkNum);
void BM25InitPage(Buffer buf, Page page);
void BM25InitRegisterPage(Relation index, Buffer *buf, Page *page, GenericXLogState **state);
void BM25CommitBuffer(Buffer buf, GenericXLogState *state);
void BM25AppendPage(Relation index, Buffer *buf, Page *page, ForkNumber forkNum, bool unlockOldBuf = true);
void BM25GetMetaPageInfo(Relation index, BM25MetaPage metap);
uint32 BM25AllocateDocId(Relation index);
uint32 BM25AllocateTokenId(Relation index);
void BM25IncreaseDocAndTokenCount(Relation index, uint32 tokenCount, float &avgdl);
BlockNumber SeekBlocknoForDoc(Relation index, uint32 docId, BlockNumber startBlkno, BlockNumber step);
bool FindHashBucket(uint32 bucketId, BM25PageLocationInfo &bucketLocation, Buffer buf, Page page);
bool FindTokenMeta(BM25TokenData &tokenData, BM25PageLocationInfo &tokenMetaLocation, Buffer buf, Page page);
BM25TokenizedDocData BM25DocumentTokenize(const char* doc);
Datum bm25build(PG_FUNCTION_ARGS);
Datum bm25buildempty(PG_FUNCTION_ARGS);