diff --git a/src/common/backend/nodes/tidbitmap.cpp b/src/common/backend/nodes/tidbitmap.cpp index 00dc5d881..398638781 100644 --- a/src/common/backend/nodes/tidbitmap.cpp +++ b/src/common/backend/nodes/tidbitmap.cpp @@ -45,6 +45,7 @@ #include "nodes/bitmapset.h" #include "nodes/tidbitmap.h" #include "utils/hsearch.h" +#include "utils/hashutils.h" #include "access/ustore/knl_upage.h" /* @@ -64,12 +65,12 @@ * for that page in the page table. * * We actually store both exact pages and lossy chunks in the same hash - * table, using identical data structures. (This is because dynahash.c's - * memory management doesn't allow space to be transferred easily from one - * hashtable to another.) Therefore it's best if PAGES_PER_CHUNK is the - * same as MAX_TUPLES_PER_PAGE, or at least not too different. But we - * also want PAGES_PER_CHUNK to be a power of 2 to avoid expensive integer - * remainder operations. So, define it like this: + * table, using identical data structures. (This is because the memory + * management for hashtables doesn't easily/efficiently allow space to be + * transferred easily from onehashtable to another.) Therefore it's best + * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not + * too different. But wealso want PAGES_PER_CHUNK to be a power of 2 to + * avoid expensive integer remainder operations. So, define it like this: */ #define PAGES_PER_HEAP_CHUNK (BLCKSZ / 32) #define PAGES_PER_UHEAP_CHUNK (BLCKSZ / 16) @@ -107,7 +108,7 @@ /* * Used as key of hash table for PagetableEntry. */ -typedef struct PagetableEntryNode_s { +typedef struct PagetableEntryNode { BlockNumber blockNo; /* page number (hashtable key) */ Oid partitionOid; /* used for GLOBAL partition index to indicate partition table */ int2 bucketid; /* used for cross-bucket index on hashbucket table */ @@ -129,21 +130,22 @@ typedef struct PagetableEntryNode_s { */ typedef struct PagetableEntry { PagetableEntryNode entryNode; + char status; /* hash entry status */ bool ischunk; /* T = lossy storage, F = exact */ - bool recheck; /* should the tuples be rechecked? */ + bool recheck; /* should the tuples be rechecked? */ bitmapword words[Max(Max(WORDS_PER_HEAP_PAGE, WORDS_PER_HEAP_CHUNK), Max(WORDS_PER_UHEAP_PAGE, WORDS_PER_UHEAP_CHUNK))]; } PagetableEntry; /* - * dynahash.c is optimized for relatively large, long-lived hash tables. - * This is not ideal for TIDBitMap, particularly when we are using a bitmap - * scan on the inside of a nestloop join: a bitmap may well live only long - * enough to accumulate one entry in such cases. We therefore avoid creating - * an actual hashtable until we need two pagetable entries. When just one - * pagetable entry is needed, we store it in a fixed field of TIDBitMap. - * (NOTE: we don't get rid of the hashtable if the bitmap later shrinks down - * to zero or one page again. So, status can be TBM_HASH even when nentries - * is zero or one.) + * We want to avoid the overhead of creating the hashtable, which is + * comparatively large, when not necessary.particularly when we are using a + * bitmap scan on the inside of a nestloop join: a bitmap may well live only + * long enough to accumulate one entry in such cases. We therefore avoid + * creating an actual hashtable until we need two pagetable entries. When + * just one pagetable entry is needed, we store it in a fixed field of + * TIDBitMap. (NOTE: we don't get rid of the hashtable if the bitmap later + * shrinks down to zero or one page again. 
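The words[] array in PagetableEntry above is the per-page bitmap that the rest of this file manipulates. As a minimal, standalone sketch of the encoding (not code from this patch), this is how a 1-based tuple offset maps onto a bit; WORDNUM/BITNUM mirror the macros used later in the file, and the 64-bit bitmapword width is an assumption, the real type comes from bitmapset.h:

#include <cstdint>

typedef uint64_t bitmapword;                 /* assumed width; see bitmapset.h for the real definition */
#define BITS_PER_BITMAPWORD 64
#define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
#define BITNUM(x)  ((x) % BITS_PER_BITMAPWORD)

/* Set the bit for the 1-based tuple offset 'off' in an exact page's word array. */
static inline void set_tuple_bit(bitmapword *words, int off)
{
    int bitno = off - 1;                     /* offsets are 1-based, bits are 0-based */
    words[WORDNUM(bitno)] |= ((bitmapword)1 << BITNUM(bitno));
}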
So, status can be TBM_HASH even + * when nentries is zero or one.) */ typedef enum { TBM_EMPTY, /* no hashtable, nentries == 0 */ @@ -151,6 +153,16 @@ typedef enum { TBM_HASH /* pagetable is valid, entry1 is not */ } TBMStatus; +/* + * Marks a tbm hash table type, used in template. + */ +typedef enum { + TBM_DYNAMIC_HASH, /* use dynamic hash table */ + TBM_SIMPLE_HASH, /* use simple hash table */ +} TBMHashType; + +#define TBM_TEMPLATE template + /* * Here is the representation for a whole TIDBitMap: */ @@ -158,18 +170,21 @@ struct TIDBitmap { NodeTag type; /* to make it a valid Node */ MemoryContext mcxt; /* memory context containing me */ TBMStatus status; /* see codes above */ + TBMHandler handler; /* tid bitmap handlers */ HTAB* pagetable; /* hash table of PagetableEntry's */ + struct pagetable_hash* simple_pagetable; /* hash table of simplehash implementation */ int nentries; /* number of entries in pagetable */ int maxentries; /* limit on same to meet maxbytes */ int npages; /* number of exact entries in pagetable */ int nchunks; /* number of lossy entries in pagetable */ bool iterating; /* tbm_begin_iterate called? */ - bool isGlobalPart; /* represent global partition index tbm */ - bool crossbucket; /* represent crossbucket index tbm */ + uint32 lossify_start; /* offset to start lossifying hashtable at */ PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */ /* these are valid when iterating is true: */ PagetableEntry** spages; /* sorted exact-page list, or NULL */ PagetableEntry** schunks; /* sorted lossy-chunk list, or NULL */ + bool is_global_part; /* is global index */ + bool is_crossbucket; /* is crossbucket index */ bool is_ustore; int max_tuples_page; int pages_per_chunk; @@ -190,26 +205,76 @@ struct TBMIterator { TBMIterateResult output; /* MUST BE LAST (because variable-size) */ }; -/* Local function prototypes */ -static void tbm_union_page(TIDBitmap* a, const PagetableEntry* bpage); -static bool tbm_intersect_page(TIDBitmap* a, PagetableEntry* apage, const TIDBitmap* b); -static const PagetableEntry* tbm_find_pageentry(const TIDBitmap* tbm, PagetableEntryNode pageNode); -static PagetableEntry* tbm_get_pageentry(TIDBitmap* tbm, PagetableEntryNode pageNode); -static bool tbm_page_is_lossy(const TIDBitmap* tbm, PagetableEntryNode pageNode); -static void tbm_mark_page_lossy(TIDBitmap* tbm, PagetableEntryNode pageNode); -static void tbm_lossify(TIDBitmap* tbm); -static int tbm_comparator(const void* left, const void* right); +/* + * Local function prototypes + */ +TBM_TEMPLATE static void tbm_create_pagetable(TIDBitmap* tbm); +TBM_TEMPLATE static void tbm_init_handlers(TIDBitmap* tbm); +TBM_TEMPLATE static void tbm_add_tuples(TIDBitmap* tbm, const ItemPointer tids, int ntids, bool recheck, Oid partitionOid = InvalidOid, int2 bucketid = InvalidBktId); +TBM_TEMPLATE static void tbm_add_page(TIDBitmap* tbm, BlockNumber pageno, Oid partitionOid = InvalidOid, int2 bucketid = InvalidBktId); + +/* tid bitmap operation prototypes */ +TBM_TEMPLATE static void tbm_union(TIDBitmap* a, const TIDBitmap* bpage); +TBM_TEMPLATE static void tbm_intersect(TIDBitmap* a, const TIDBitmap* b); +TBM_TEMPLATE static void tbm_union_page(TIDBitmap* a, const PagetableEntry* bpage); +TBM_TEMPLATE static bool tbm_intersect_page(TIDBitmap* a, PagetableEntry* apage, const TIDBitmap* b); + +/* tid bitmap iterator prototypes */ +TBM_TEMPLATE static TBMIterator* tbm_begin_iterate(TIDBitmap* tbm); + +/* tid bitmap page entry prototypes */ +TBM_TEMPLATE static const PagetableEntry* 
tbm_find_pageentry(const TIDBitmap* tbm, PagetableEntryNode pageNode); +TBM_TEMPLATE static PagetableEntry* tbm_get_pageentry(TIDBitmap* tbm, PagetableEntryNode pageNode); + +/* tid bitmap lossy prototypes */ +TBM_TEMPLATE static bool tbm_page_is_lossy(const TIDBitmap* tbm, PagetableEntryNode pageNode); +TBM_TEMPLATE static void tbm_mark_page_lossy(TIDBitmap* tbm, PagetableEntryNode pageNode); +TBM_TEMPLATE static void tbm_lossify(TIDBitmap* tbm); +TBM_TEMPLATE static inline void tbm_lossify_generic_iterate(TIDBitmap* tbm); +TBM_TEMPLATE static inline void tbm_lossify_simple_iterate(TIDBitmap* tbm); + +/* tid bitmap utility prototypes */ +TBM_TEMPLATE static int tbm_comparator(const void* left, const void* right); /* - * TbmCreate - create an initially-empty bitmap + * tbm_hash_complex_key : private hash function for pagetableEntryNode + */ +static inline uint32 tbm_hash_complex_key(const void* key, Size keysize) +{ + PagetableEntryNode* node = (PagetableEntryNode*)key; + uint32 ret = murmurhash32(node->blockNo); + + ret = hash_combine(ret, murmurhash32(node->partitionOid)); + ret = hash_combine(ret, murmurhash32(node->bucketid)); + return ret; +} + +/* define hashtable mapping block numbers to PagetableEntry's */ +#define SH_PREFIX pagetable +#define SH_ELEMENT_TYPE PagetableEntry +#define SH_KEY_TYPE BlockNumber +#define SH_KEY entryNode.blockNo +#define SH_HASH_KEY(tb, key) murmurhash32(key) +#define SH_EQUAL(tb, a, b) (a == b) +#define SH_SCOPE static inline +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" + +/* + * tbm_create - create an initially-empty bitmap * * The bitmap will live in the memory context that is CurrentMemoryContext * at the time of this call. It will be limited to (approximately) maxbytes * total memory consumption. + * + * when GPI or CPI is involved. Both of them requires extra key(s) to create + * the hashtable (partitionOid and bucketid to be exact). */ -TIDBitmap* TbmCreate(long maxbytes, bool is_ustore) +TIDBitmap* tbm_create(long maxbytes, bool is_global_part, bool is_crossbucket, bool is_ustore) { TIDBitmap* tbm = NULL; + bool complex_key = (is_global_part || is_crossbucket); long nbuckets; /* Create the TIDBitmap struct and zero all its fields */ @@ -217,19 +282,33 @@ TIDBitmap* TbmCreate(long maxbytes, bool is_ustore) tbm->mcxt = CurrentMemoryContext; tbm->status = TBM_EMPTY; - tbm->isGlobalPart = false; + + /* + * Fill TBM handlers base on the complexity of the keys. + * If the context requires complementary keys like partitionOid or + * bucketid, we use generic dynamichash table to accomodate bitmap. + * Otherwise, we use a more cache-friendly hash table to do the + * trick. + */ + if (!complex_key) { + tbm_init_handlers(tbm); + } else { + tbm_init_handlers(tbm); + } + /* * Estimate number of hashtable entries we can have within maxbytes. This - * estimates the hash overhead at MAXALIGN(sizeof(HASHELEMENT)) plus a - * pointer per hash entry, which is crude but good enough for our purpose. - * Also count an extra Pointer per entry for the arrays created during - * iteration readout. + * estimates the hash cost as at sizeof(PagetableEntry), which is good enough + * for our purpose. Alse count an extra pointer per hash entry for the arrays + * created during iteration readout. 
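To show how the pieces above fit together, here is a hedged usage sketch of the handler-based API, loosely modeled on the executor and index-AM call sites updated later in this patch; indexRel and tid are placeholders rather than names from this change:

long work_mem_bytes = u_sess->attr.attr_memory.work_mem * 1024L;
TIDBitmap *tbm = tbm_create(work_mem_bytes,
                            RelationIsGlobalIndex(indexRel),      /* GPI adds partitionOid to the key */
                            RelationIsCrossBucketIndex(indexRel), /* CBI adds bucketid to the key */
                            false);                               /* not a ustore relation */

/* Every per-bitmap operation now goes through the handler filled in by tbm_create(). */
TBMHandler h = tbm_get_handler(tbm);
h._add_tuples(tbm, &tid, 1, false /* no recheck */, InvalidOid, InvalidBktId);

TBMIterator *it = h._begin_iterate(tbm);
/* ... tbm_iterate(it), tbm_end_iterate(it) as before ... */
tbm_free(tbm);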
*/ - nbuckets = maxbytes / - (MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(PagetableEntry)) + sizeof(Pointer) + sizeof(Pointer)); - nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */ - nbuckets = Max(nbuckets, 16); /* sanity limit */ + nbuckets = tbm_calculate_entries(maxbytes, complex_key); tbm->maxentries = (int)nbuckets; + tbm->lossify_start = 0; + + /* Set TBM index & storage attributes */ + tbm->is_global_part = is_global_part; + tbm->is_crossbucket = is_crossbucket; tbm->is_ustore = is_ustore; if (is_ustore) { @@ -246,38 +325,76 @@ TIDBitmap* TbmCreate(long maxbytes, bool is_ustore) } /* - * Actually create the hashtable. Since this is a moderately expensive - * proposition, we don't do it until we have to. + * Tid bitmap handler initializer. + * + * initialize templated utility tbm handlers, so that the caller can invoke. */ -static void tbm_create_pagetable(TIDBitmap* tbm) +TBM_TEMPLATE static void tbm_init_handlers(TIDBitmap* tbm) { - HASHCTL hash_ctl; - int rc = 0; + tbm->handler._add_tuples = tbm_add_tuples; + tbm->handler._add_page= tbm_add_page; + + tbm->handler._union = tbm_union; + tbm->handler._intersect = tbm_intersect; + + tbm->handler._begin_iterate = tbm_begin_iterate; +} + +/* + * Get bitmap handler. + * + * get templated utility tbm handlers, so that the caller can invoke. + */ +TBMHandler tbm_get_handler(TIDBitmap* tbm) +{ + return tbm->handler; +} +/* + * Actually create the hashtable. + * + * Since this is a moderately expensive proposition, we don't do it until we have to. + */ +TBM_TEMPLATE static void tbm_create_pagetable(TIDBitmap* tbm) +{ + errno_t rc = EOK; Assert(tbm->status != TBM_HASH); Assert(tbm->pagetable == NULL); - /* Create the hashtable proper */ - rc = memset_s(&hash_ctl, sizeof(hash_ctl), 0, sizeof(hash_ctl)); - securec_check(rc, "", ""); - hash_ctl.keysize = sizeof(PagetableEntryNode); - hash_ctl.entrysize = sizeof(PagetableEntry); - hash_ctl.hash = tag_hash; - hash_ctl.hcxt = tbm->mcxt; - tbm->pagetable = hash_create("TIDBitmap", - 128, /* start small and extend */ - &hash_ctl, - HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); + if (type == TBM_SIMPLE_HASH) { + tbm->simple_pagetable = (struct pagetable_hash*)pagetable_create(tbm->mcxt, 128, tbm); + } else { + /* Create the hashtable proper */ + HASHCTL hash_ctl; + rc = memset_s(&hash_ctl, sizeof(hash_ctl), 0, sizeof(hash_ctl)); + securec_check(rc, "", ""); + hash_ctl.keysize = sizeof(PagetableEntryNode); + hash_ctl.entrysize = sizeof(PagetableEntry); + hash_ctl.hash = tbm_hash_complex_key; + hash_ctl.hcxt = tbm->mcxt; + tbm->pagetable = hash_create("TIDBitmap", 128, /* start small and extend */ + &hash_ctl, HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); + } /* If entry1 is valid, push it into the hashtable */ if (tbm->status == TBM_ONE_PAGE) { PagetableEntry* page = NULL; bool found = false; + char oldstatus; - page = (PagetableEntry*)hash_search(tbm->pagetable, (void*)&tbm->entry1.entryNode, HASH_ENTER, &found); - Assert(!found); - errno_t rc = memcpy_s(page, sizeof(PagetableEntry), &tbm->entry1, sizeof(PagetableEntry)); - securec_check(rc, "\0", "\0"); + if (type == TBM_SIMPLE_HASH) { + page = pagetable_insert(tbm->simple_pagetable, tbm->entry1.entryNode.blockNo, &found); + Assert(!found); + oldstatus = page->status; + rc = memcpy_s(page, sizeof(PagetableEntry), &tbm->entry1, sizeof(PagetableEntry)); + securec_check(rc, "\0", "\0"); + page->status = oldstatus; + } else { + page = (PagetableEntry *)hash_search(tbm->pagetable, (void *)&tbm->entry1.entryNode, HASH_ENTER, &found); + 
Assert(!found); + rc = memcpy_s(page, sizeof(PagetableEntry), &tbm->entry1, sizeof(PagetableEntry)); + securec_check(rc, "\0", "\0"); + } } tbm->status = TBM_HASH; @@ -291,6 +408,9 @@ void tbm_free(TIDBitmap* tbm) if (tbm->pagetable != NULL) { hash_destroy(tbm->pagetable); } + if (tbm->simple_pagetable != NULL) { + pagetable_destroy(tbm->simple_pagetable); + } if (tbm->spages != NULL) { pfree_ext(tbm->spages); } @@ -301,19 +421,31 @@ void tbm_free(TIDBitmap* tbm) } /* + * tbm_calculate_entries + * * Estimate number of hashtable entries we can have within maxbytes. + * complex_keys is set when evaluating bitmaps with partitioned + * relations (e.g GPI, CBI etc.) */ -long tbm_calculate_entries(double maxbytes) -{ +long tbm_calculate_entries(double maxbytes, bool complex_keys) +{ + long nbuckets; + /* - * This estimates the hash cost as sizeof(PagetableEntry), which is good enough - * for our purpose. Also count an extra Pointer per entry for the arrays created - * during iteration readout. + * Estimate number of hashtable entries we can have within maxbytes. This + * estimates the hash cost as sizeof(PagetableEntry), which is good enough + * for our purpose. Also count an extra Pointer per entry for the arrays + * created during iteration readout. */ - long nbuckets = maxbytes / (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer)); + if (complex_keys) { + nbuckets = maxbytes / + (MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(PagetableEntry)) + sizeof(Pointer) + sizeof(Pointer)); + } else { + nbuckets = maxbytes / (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer)); + } + nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */ - const int max_buckets_lower_limit = 16; - nbuckets = Max(nbuckets, max_buckets_lower_limit); /* sanity limit */ + nbuckets = Max(nbuckets, 16); /* sanity limit */ return nbuckets; } @@ -324,7 +456,7 @@ long tbm_calculate_entries(double maxbytes) * If recheck is true, then the recheck flag will be set in the * TBMIterateResult when any of these tuples are reported out. */ -void tbm_add_tuples(TIDBitmap* tbm, const ItemPointer tids, int ntids, bool recheck, Oid partitionOid, int2 bucketid) +TBM_TEMPLATE void tbm_add_tuples(TIDBitmap* tbm, const ItemPointer tids, int ntids, bool recheck, Oid partitionOid, int2 bucketid) { int i; @@ -344,11 +476,11 @@ void tbm_add_tuples(TIDBitmap* tbm, const ItemPointer tids, int ntids, bool rech errmsg("tuple offset out of range: %u", off))); } - if (tbm_page_is_lossy(tbm, pageNode)) { + if (tbm_page_is_lossy(tbm, pageNode)) { continue; /* whole page is already marked */ } - page = tbm_get_pageentry(tbm, pageNode); + page = tbm_get_pageentry(tbm, pageNode); if (page->ischunk) { /* The page is a lossy chunk header, set bit for itself */ @@ -362,7 +494,7 @@ void tbm_add_tuples(TIDBitmap* tbm, const ItemPointer tids, int ntids, bool rech page->recheck |= recheck; if (tbm->nentries > tbm->maxentries) { - tbm_lossify(tbm); + tbm_lossify(tbm); } } } @@ -373,14 +505,14 @@ void tbm_add_tuples(TIDBitmap* tbm, const ItemPointer tids, int ntids, bool rech * This causes the whole page to be reported (with the recheck flag) * when the TIDBitmap is scanned. 
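To make the exact/lossy distinction concrete, here is a hedged consumer-side sketch following the usual TIDBitmap iterator contract, in which a negative ntuples marks a lossy page; recheck_whole_page and fetch_tuple are placeholders, not functions from this patch:

TBMIterateResult *res;
while ((res = tbm_iterate(iterator)) != NULL) {
    if (res->ntuples < 0) {
        /* Lossy page: only the block number survived, so every visible
         * tuple on res->blockno has to be fetched and rechecked. */
        recheck_whole_page(res->blockno);
    } else {
        /* Exact page: res->offsets[] lists the matching line pointers. */
        for (int i = 0; i < res->ntuples; i++)
            fetch_tuple(res->blockno, res->offsets[i], res->recheck);
    }
}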
*/ -void tbm_add_page(TIDBitmap* tbm, BlockNumber pageno, Oid partitionOid, int2 bucketid) +TBM_TEMPLATE void tbm_add_page(TIDBitmap* tbm, BlockNumber pageno, Oid partitionOid, int2 bucketid) { PagetableEntryNode pnode = {pageno, partitionOid, bucketid}; /* Enter the page in the bitmap, or mark it lossy if already present */ - tbm_mark_page_lossy(tbm, pnode); + tbm_mark_page_lossy(tbm, pnode); /* If we went over the memory limit, lossify some more pages */ if (tbm->nentries > tbm->maxentries) { - tbm_lossify(tbm); + tbm_lossify(tbm); } } @@ -389,8 +521,9 @@ void tbm_add_page(TIDBitmap* tbm, BlockNumber pageno, Oid partitionOid, int2 buc * * a is modified in-place, b is not changed */ -void tbm_union(TIDBitmap* a, const TIDBitmap* b) +TBM_TEMPLATE void tbm_union(TIDBitmap* a, const TIDBitmap* b) { + PagetableEntry* bpage = NULL; Assert(!a->iterating); /* Nothing to do if b is empty */ if (b->nentries == 0) { @@ -398,21 +531,29 @@ void tbm_union(TIDBitmap* a, const TIDBitmap* b) } /* Scan through chunks and pages in b, merge into a */ if (b->status == TBM_ONE_PAGE) { - tbm_union_page(a, &b->entry1); + tbm_union_page(a, &b->entry1); + return; + } + + Assert(b->status == TBM_HASH); + + if (type == TBM_SIMPLE_HASH) { + pagetable_iterator i; + pagetable_start_iterate(b->simple_pagetable, &i); + while ((bpage = pagetable_iterate(b->simple_pagetable, &i)) != NULL) { + tbm_union_page(a, bpage); + } } else { HASH_SEQ_STATUS status; - PagetableEntry* bpage = NULL; - - Assert(b->status == TBM_HASH); hash_seq_init(&status, b->pagetable); while ((bpage = (PagetableEntry*)hash_seq_search(&status)) != NULL) { - tbm_union_page(a, bpage); + tbm_union_page(a, bpage); } - } + } } /* Process one page of b during a union op */ -static void tbm_union_page(TIDBitmap* a, const PagetableEntry* bpage) +TBM_TEMPLATE static void tbm_union_page(TIDBitmap* a, const PagetableEntry* bpage) { PagetableEntry* apage = NULL; int wordnum; @@ -429,18 +570,18 @@ static void tbm_union_page(TIDBitmap* a, const PagetableEntry* bpage) while (w != 0) { if (w & 1) { PagetableEntryNode unionNode = {pg, bpage->entryNode.partitionOid, bpage->entryNode.bucketid}; - tbm_mark_page_lossy(a, unionNode); + tbm_mark_page_lossy(a, unionNode); } pg++; w >>= 1; } } } - } else if (tbm_page_is_lossy(a, bpage->entryNode)) { + } else if (tbm_page_is_lossy(a, bpage->entryNode)) { /* page is already lossy in a, nothing to do */ return; } else { - apage = tbm_get_pageentry(a, bpage->entryNode); + apage = tbm_get_pageentry(a, bpage->entryNode); if (apage->ischunk) { /* The page is a lossy chunk header, set bit for itself */ apage->words[0] |= ((bitmapword)1 << 0); @@ -454,7 +595,7 @@ static void tbm_union_page(TIDBitmap* a, const PagetableEntry* bpage) } if (a->nentries > a->maxentries) { - tbm_lossify(a); + tbm_lossify(a); } } @@ -463,16 +604,19 @@ static void tbm_union_page(TIDBitmap* a, const PagetableEntry* bpage) * * a is modified in-place, b is not changed */ -void tbm_intersect(TIDBitmap* a, const TIDBitmap* b) -{ +TBM_TEMPLATE void tbm_intersect(TIDBitmap* a, const TIDBitmap* b) +{ + PagetableEntry* apage = NULL; + Assert(!a->iterating); /* Nothing to do if a is empty */ if (a->nentries == 0) { return; } + /* Scan through chunks and pages in a, try to match to b */ if (a->status == TBM_ONE_PAGE) { - if (tbm_intersect_page(a, &a->entry1, b)) { + if (tbm_intersect_page(a, &a->entry1, b)) { /* Page is now empty, remove it from a */ Assert(!a->entry1.ischunk); a->npages--; @@ -480,14 +624,17 @@ void tbm_intersect(TIDBitmap* a, const TIDBitmap* b) 
Assert(a->nentries == 0); a->status = TBM_EMPTY; } - } else { - HASH_SEQ_STATUS status; - PagetableEntry* apage = NULL; + return; + } - Assert(a->status == TBM_HASH); - hash_seq_init(&status, a->pagetable); - while ((apage = (PagetableEntry*)hash_seq_search(&status)) != NULL) { - if (tbm_intersect_page(a, apage, b)) { + Assert(a->status == TBM_HASH); + + if (type == TBM_SIMPLE_HASH) { + pagetable_iterator i; + + pagetable_start_iterate(a->simple_pagetable, &i); + while ((apage = pagetable_iterate(a->simple_pagetable, &i)) != NULL) { + if (tbm_intersect_page(a, apage, b)) { /* Page or chunk is now empty, remove it from a */ if (apage->ischunk) { a->nchunks--; @@ -495,10 +642,29 @@ void tbm_intersect(TIDBitmap* a, const TIDBitmap* b) a->npages--; } a->nentries--; - if (hash_search(a->pagetable, (void*)&apage->entryNode, HASH_REMOVE, NULL) == NULL) { + if (!pagetable_delete(a->simple_pagetable,apage->entryNode.blockNo)) { ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), errmodule(MOD_EXECUTOR), errmsg("hash table corrupted"))); - } + (errcode(ERRCODE_DATA_CORRUPTED), errmodule(MOD_EXECUTOR), errmsg("hash table corrupted"))); + } + } + } + } else { + HASH_SEQ_STATUS status; + + hash_seq_init(&status, a->pagetable); + while ((apage = (PagetableEntry *)hash_seq_search(&status)) != NULL) { + if (tbm_intersect_page(a, apage, b)) { + /* Page or chunk is now empty, remove it from a */ + if (apage->ischunk) { + a->nchunks--; + } else { + a->npages--; + } + a->nentries--; + if (hash_search(a->pagetable, (void *)&apage->entryNode, HASH_REMOVE, NULL) == NULL) { + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), errmodule(MOD_EXECUTOR), errmsg("hash table corrupted"))); + } } } } @@ -509,7 +675,7 @@ void tbm_intersect(TIDBitmap* a, const TIDBitmap* b) * * Returns TRUE if apage is now empty and should be deleted from a */ -static bool tbm_intersect_page(TIDBitmap* a, PagetableEntry* apage, const TIDBitmap* b) +TBM_TEMPLATE static bool tbm_intersect_page(TIDBitmap* a, PagetableEntry* apage, const TIDBitmap* b) { const PagetableEntry* bpage = NULL; int wordnum; @@ -533,7 +699,7 @@ static bool tbm_intersect_page(TIDBitmap* a, PagetableEntry* apage, const TIDBit while (w != 0) { if (w & 1) { PagetableEntryNode pNode = {pg, apage->entryNode.partitionOid, apage->entryNode.bucketid}; - if (!tbm_page_is_lossy(b, pNode) && tbm_find_pageentry(b, pNode) == NULL) { + if (!tbm_page_is_lossy(b, pNode) && tbm_find_pageentry(b, pNode) == NULL) { /* Page is not in b at all, lose lossy bit */ neww &= ~((bitmapword)1 << (unsigned int)bitnum); } @@ -549,7 +715,7 @@ static bool tbm_intersect_page(TIDBitmap* a, PagetableEntry* apage, const TIDBit } } return candelete; - } else if (tbm_page_is_lossy(b, apage->entryNode)) { + } else if (tbm_page_is_lossy(b, apage->entryNode)) { /* * Some of the tuples in 'a' might not satisfy the quals for 'b', but * because the page 'b' is lossy, we don't know which ones. Therefore @@ -561,7 +727,7 @@ static bool tbm_intersect_page(TIDBitmap* a, PagetableEntry* apage, const TIDBit } else { bool candelete = true; - bpage = tbm_find_pageentry(b, apage->entryNode); + bpage = tbm_find_pageentry(b, apage->entryNode); if (bpage != NULL) { /* Both pages are exact, merge at the bit level */ Assert(!bpage->ischunk); @@ -599,7 +765,7 @@ bool tbm_is_empty(const TIDBitmap* tbm) * of the bitmap. However, you can call this multiple times to scan the * contents repeatedly, including parallel scans. 
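For reference, the exact-vs-exact branch of tbm_intersect_page above reduces to a word-wise AND; a simplified, self-contained sketch (nwords stands in for the heap/ustore word counts used in this file):

/* AND two exact pages; report whether the intersection is now empty. */
static bool intersect_exact_pages(bitmapword *awords, const bitmapword *bwords, int nwords)
{
    bool candelete = true;

    for (int w = 0; w < nwords; w++) {
        awords[w] &= bwords[w];
        if (awords[w] != 0)
            candelete = false;     /* at least one tuple survived on this page */
    }
    return candelete;              /* caller removes the entry from the table when true */
}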
*/ -TBMIterator* tbm_begin_iterate(TIDBitmap* tbm) +TBM_TEMPLATE TBMIterator* tbm_begin_iterate(TIDBitmap* tbm) { TBMIterator* iterator = NULL; @@ -624,7 +790,6 @@ TBMIterator* tbm_begin_iterate(TIDBitmap* tbm) * than one iterator. */ if (tbm->status == TBM_HASH && !tbm->iterating) { - HASH_SEQ_STATUS status; PagetableEntry* page = NULL; int npages; int nchunks; @@ -635,23 +800,38 @@ TBMIterator* tbm_begin_iterate(TIDBitmap* tbm) if ((tbm->schunks == NULL) && tbm->nchunks > 0) { tbm->schunks = (PagetableEntry**)MemoryContextAlloc(tbm->mcxt, tbm->nchunks * sizeof(PagetableEntry*)); } - - hash_seq_init(&status, tbm->pagetable); + npages = nchunks = 0; - while ((page = (PagetableEntry*)hash_seq_search(&status)) != NULL) { - if (page->ischunk) { - tbm->schunks[nchunks++] = page; - } else { - tbm->spages[npages++] = page; + if (type == TBM_SIMPLE_HASH) { + pagetable_iterator i; + pagetable_start_iterate(tbm->simple_pagetable, &i); + while ((page = pagetable_iterate(tbm->simple_pagetable, &i)) != NULL) { + if (page->ischunk) { + tbm->schunks[nchunks++] = page; + } else { + tbm->spages[npages++] = page; + } + } + } else { + /* make TBM_DYNAMIC_HASH a default*/ + HASH_SEQ_STATUS status; + hash_seq_init(&status, tbm->pagetable); + while ((page = (PagetableEntry *)hash_seq_search(&status)) != NULL) { + if (page->ischunk) { + tbm->schunks[nchunks++] = page; + } else { + tbm->spages[npages++] = page; + } } } + Assert(npages == tbm->npages); Assert(nchunks == tbm->nchunks); if (npages > 1) { - qsort(tbm->spages, npages, sizeof(PagetableEntry*), tbm_comparator); + qsort(tbm->spages, npages, sizeof(PagetableEntry*), tbm_comparator); } if (nchunks > 1) { - qsort(tbm->schunks, nchunks, sizeof(PagetableEntry*), tbm_comparator); + qsort(tbm->schunks, nchunks, sizeof(PagetableEntry*), tbm_comparator); } } @@ -711,11 +891,11 @@ TBMIterateResult* tbm_iterate(TBMIterator* iterator) */ if (iterator->schunkptr < tbm->nchunks) { PagetableEntry* chunk = tbm->schunks[iterator->schunkptr]; - PagetableEntryNode pnode; - pnode.blockNo = chunk->entryNode.blockNo + iterator->schunkbit; - pnode.partitionOid = chunk->entryNode.partitionOid; - pnode.bucketid = chunk->entryNode.bucketid; - pnode.padding = chunk->entryNode.padding; + PagetableEntryNode pnode = { + chunk->entryNode.blockNo + iterator->schunkbit, + chunk->entryNode.partitionOid, + chunk->entryNode.bucketid + }; if (iterator->spageptr >= tbm->npages || IS_CHUNK_BEFORE_PAGE(pnode, tbm->spages[iterator->spageptr]->entryNode)) { /* Return a lossy page indicator from the chunk */ @@ -788,7 +968,7 @@ void tbm_end_iterate(TBMIterator* iterator) * * Returns NULL if there is no non-lossy entry for the pageno. */ -static const PagetableEntry* tbm_find_pageentry(const TIDBitmap* tbm, PagetableEntryNode pageNode) +TBM_TEMPLATE static const PagetableEntry* tbm_find_pageentry(const TIDBitmap* tbm, PagetableEntryNode pageNode) { const PagetableEntry* page = NULL; @@ -805,7 +985,12 @@ static const PagetableEntry* tbm_find_pageentry(const TIDBitmap* tbm, PagetableE return page; } - page = (PagetableEntry*)hash_search(tbm->pagetable, (void*)&pageNode, HASH_FIND, NULL); + if (type == TBM_SIMPLE_HASH) { + page = pagetable_lookup(tbm->simple_pagetable, pageNode.blockNo); + } else { + page = (PagetableEntry*)hash_search(tbm->pagetable, (void*)&pageNode, HASH_FIND, NULL); + } + if (page == NULL) { return NULL; } @@ -823,7 +1008,7 @@ static const PagetableEntry* tbm_find_pageentry(const TIDBitmap* tbm, PagetableE * This may cause the table to exceed the desired memory size. 
It is * up to the caller to call tbm_lossify() at the next safe point if so. */ -static PagetableEntry* tbm_get_pageentry(TIDBitmap* tbm, PagetableEntryNode pageNode) +TBM_TEMPLATE static PagetableEntry* tbm_get_pageentry(TIDBitmap* tbm, PagetableEntryNode pageNode) { PagetableEntry* page = NULL; bool found = false; @@ -841,17 +1026,30 @@ static PagetableEntry* tbm_get_pageentry(TIDBitmap* tbm, PagetableEntryNode page return page; } /* Time to switch from one page to a hashtable */ - tbm_create_pagetable(tbm); + tbm_create_pagetable(tbm); } /* Look up or create an entry */ - page = (PagetableEntry*)hash_search(tbm->pagetable, (void*)&pageNode, HASH_ENTER, &found); + if (type == TBM_SIMPLE_HASH) { + page = pagetable_insert(tbm->simple_pagetable, pageNode.blockNo, &found); + } else { + /* make TBM_DYNAMIC_HASH a default */ + page = (PagetableEntry*)hash_search(tbm->pagetable, (void*)&pageNode, HASH_ENTER, &found); + } } /* Initialize it if not present before */ if (!found) { - rc = memset_s(page, sizeof(PagetableEntry), 0, sizeof(PagetableEntry)); - securec_check(rc, "", ""); + char oldstatus; + if (type == TBM_SIMPLE_HASH) { + oldstatus = page->status; + rc = memset_s(page, sizeof(PagetableEntry), 0, sizeof(PagetableEntry)); + securec_check(rc, "", ""); + page->status = oldstatus; + } else { + rc = memset_s(page, sizeof(PagetableEntry), 0, sizeof(PagetableEntry)); + securec_check(rc, "", ""); + } page->entryNode.blockNo = pageNode.blockNo; page->entryNode.partitionOid = pageNode.partitionOid; page->entryNode.bucketid = pageNode.bucketid; @@ -866,7 +1064,7 @@ static PagetableEntry* tbm_get_pageentry(TIDBitmap* tbm, PagetableEntryNode page /* * tbm_page_is_lossy - is the page marked as lossily stored? */ -static bool tbm_page_is_lossy(const TIDBitmap* tbm, PagetableEntryNode pageNode) +TBM_TEMPLATE static bool tbm_page_is_lossy(const TIDBitmap* tbm, PagetableEntryNode pageNode) { PagetableEntry* page = NULL; BlockNumber chunkPageNo; @@ -881,7 +1079,13 @@ static bool tbm_page_is_lossy(const TIDBitmap* tbm, PagetableEntryNode pageNode) bitno = pageNode.blockNo % tbm->pages_per_chunk; chunkPageNo = pageNode.blockNo - bitno; PagetableEntryNode chunkNode = {chunkPageNo, pageNode.partitionOid, pageNode.bucketid}; - page = (PagetableEntry*)hash_search(tbm->pagetable, (void*)&chunkNode, HASH_FIND, NULL); + + if (type == TBM_SIMPLE_HASH) { + page = pagetable_lookup(tbm->simple_pagetable, chunkNode.blockNo); + } else { + page = (PagetableEntry*)hash_search(tbm->pagetable, (void*)&chunkNode, HASH_FIND, NULL); + } + if (page != NULL && page->ischunk) { int wordnum = WORDNUM(bitno); int bitnum = BITNUM(bitno); @@ -899,19 +1103,20 @@ static bool tbm_page_is_lossy(const TIDBitmap* tbm, PagetableEntryNode pageNode) * This may cause the table to exceed the desired memory size. It is * up to the caller to call tbm_lossify() at the next safe point if so. 
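The save/restore of page->status above is worth a note: simplehash keeps each slot's occupancy state inside the element itself (the char status member added to PagetableEntry), so zeroing the whole entry right after pagetable_insert() would silently mark the slot as empty again. A condensed sketch of the pattern, using plain memset for brevity:

bool found;
PagetableEntry *page = pagetable_insert(tbm->simple_pagetable, blockno, &found);
if (!found) {
    char oldstatus = page->status;              /* slot state owned by simplehash */
    memset(page, 0, sizeof(PagetableEntry));    /* reset the payload only */
    page->status = oldstatus;                   /* keep the slot marked in use */
    page->entryNode.blockNo = blockno;
}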
*/ -static void tbm_mark_page_lossy(TIDBitmap* tbm, PagetableEntryNode pageNode) +TBM_TEMPLATE static void tbm_mark_page_lossy(TIDBitmap* tbm, PagetableEntryNode pageNode) { PagetableEntry* page = NULL; bool found = false; + bool deleted = false; BlockNumber chunkPageNo; int bitno; int wordnum; int bitnum; - int rc = 0; + int rc = 0; /* We force the bitmap into hashtable mode whenever it's lossy */ if (tbm->status != TBM_HASH) { - tbm_create_pagetable(tbm); + tbm_create_pagetable(tbm); } bitno = pageNode.blockNo % tbm->pages_per_chunk; @@ -922,7 +1127,13 @@ static void tbm_mark_page_lossy(TIDBitmap* tbm, PagetableEntryNode pageNode) * chunk header, however, we skip this and handle the case below. */ if (bitno != 0) { - if (hash_search(tbm->pagetable, (void*)&pageNode, HASH_REMOVE, NULL) != NULL) { + if (type == TBM_SIMPLE_HASH) { + deleted = pagetable_delete(tbm->simple_pagetable, pageNode.blockNo); + } else { + deleted = (hash_search(tbm->pagetable, (void*)&pageNode, HASH_REMOVE, NULL) != NULL); + } + + if(deleted) { /* It was present, so adjust counts */ tbm->nentries--; tbm->npages--; /* assume it must have been non-lossy */ @@ -930,12 +1141,25 @@ static void tbm_mark_page_lossy(TIDBitmap* tbm, PagetableEntryNode pageNode) } /* Look up or create entry for chunk-header page */ - page = (PagetableEntry*)hash_search(tbm->pagetable, (void*)&chunkNode, HASH_ENTER, &found); + if (type == TBM_SIMPLE_HASH) { + page = pagetable_insert(tbm->simple_pagetable, chunkNode.blockNo, &found); + } else { + /* make TBM_DYNAMIC_HASH a default */ + page = (PagetableEntry*)hash_search(tbm->pagetable, (void*)&chunkNode, HASH_ENTER, &found); + } /* Initialize it if not present before */ if (!found) { - rc = memset_s(page, sizeof(PagetableEntry), 0, sizeof(PagetableEntry)); - securec_check(rc, "", ""); + char oldstatus; + if (type == TBM_SIMPLE_HASH) { + oldstatus = page->status; + rc = memset_s(page, sizeof(PagetableEntry), 0, sizeof(PagetableEntry)); + securec_check(rc, "", ""); + page->status = oldstatus; + } else { + rc = memset_s(page, sizeof(PagetableEntry), 0, sizeof(PagetableEntry)); + securec_check(rc, "", ""); + } page->entryNode = chunkNode; page->ischunk = true; /* must count it too */ @@ -943,8 +1167,16 @@ static void tbm_mark_page_lossy(TIDBitmap* tbm, PagetableEntryNode pageNode) tbm->nchunks++; } else if (!page->ischunk) { /* chunk header page was formerly non-lossy, make it lossy */ - rc = memset_s(page, sizeof(PagetableEntry), 0, sizeof(PagetableEntry)); - securec_check(rc, "", ""); + char oldstatus; + if (type == TBM_SIMPLE_HASH) { + oldstatus = page->status; + rc = memset_s(page, sizeof(PagetableEntry), 0, sizeof(PagetableEntry)); + securec_check(rc, "", ""); + page->status = oldstatus; + } else { + rc = memset_s(page, sizeof(PagetableEntry), 0, sizeof(PagetableEntry)); + securec_check(rc, "", ""); + } page->entryNode = chunkNode; page->ischunk = true; /* we assume it had some tuple bit(s) set, so mark it lossy */ @@ -963,11 +1195,8 @@ static void tbm_mark_page_lossy(TIDBitmap* tbm, PagetableEntryNode pageNode) /* * tbm_lossify - lose some information to get back under the memory limit */ -static void tbm_lossify(TIDBitmap* tbm) +TBM_TEMPLATE static void tbm_lossify(TIDBitmap* tbm) { - HASH_SEQ_STATUS status; - PagetableEntry* page = NULL; - /* * XXX Really stupid implementation: this just lossifies pages in * essentially random order. 
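A worked example of the chunk addressing used in tbm_mark_page_lossy above, assuming the default 8K BLCKSZ so that pages_per_chunk is BLCKSZ / 32 = 256 for heap relations:

BlockNumber blockno = 1000;
int         bitno   = blockno % 256;     /* 1000 % 256 = 232                         */
BlockNumber chunkNo = blockno - bitno;   /* 768: the header entry covering 768..1023 */

/* Within the chunk header's entry, page 1000 is bit 232 of words[], i.e.
 * words[WORDNUM(232)] bit BITNUM(232). bitno == 0 would mean the page is
 * the chunk header itself, the special case handled separately above. */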
We should be paying some attention to the @@ -980,33 +1209,11 @@ static void tbm_lossify(TIDBitmap* tbm) Assert(!tbm->iterating); Assert(tbm->status == TBM_HASH); - hash_seq_init(&status, tbm->pagetable); - while ((page = (PagetableEntry*)hash_seq_search(&status)) != NULL) { - if (page->ischunk) { - continue; /* already a chunk header */ - } - /* - * If the page would become a chunk header, we won't save anything by - * converting it to lossy, so skip it. - */ - if ((page->entryNode.blockNo % tbm->pages_per_chunk) == 0) { - continue; - } - - /* This does the dirty work ... */ - tbm_mark_page_lossy(tbm, page->entryNode); - - if (tbm->nentries <= tbm->maxentries / 2) { - /* we have done enough */ - hash_seq_term(&status); - break; - } - - /* - * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the - * hashtable. We can continue the same seq_search scan since we do - * not care whether we visit lossy chunks or not. - */ + if (type == TBM_SIMPLE_HASH) { + tbm_lossify_simple_iterate(tbm); + } else { + /* make TBM_DYNAMIC_HASH a default */ + tbm_lossify_generic_iterate(tbm); } /* @@ -1024,46 +1231,133 @@ static void tbm_lossify(TIDBitmap* tbm) } } +TBM_TEMPLATE static inline void tbm_lossify_generic_iterate(TIDBitmap* tbm) +{ + HASH_SEQ_STATUS status; + PagetableEntry* page = NULL; + + hash_seq_init(&status, tbm->pagetable); + while ((page = (PagetableEntry*)hash_seq_search(&status)) != NULL) { + if (page->ischunk) { + continue; /* already a chunk header */ + } + /* + * If the page would become a chunk header, we won't save anything by + * converting it to lossy, so skip it. + */ + if ((page->entryNode.blockNo % tbm->pages_per_chunk) == 0) { + continue; + } + + /* This does the dirty work ... */ + tbm_mark_page_lossy(tbm, page->entryNode); + + if (tbm->nentries <= tbm->maxentries / 2) { + /* we have done enough */ + hash_seq_term(&status); + break; + } + + /* + * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the + * hashtable. We can continue the same seq_search scan since we do + * not care whether we visit lossy chunks or not. + */ + } +} + +TBM_TEMPLATE static inline void tbm_lossify_simple_iterate(TIDBitmap* tbm) +{ + pagetable_iterator i; + PagetableEntry* page = NULL; + + pagetable_start_iterate_at(tbm->simple_pagetable, &i, tbm->lossify_start); + while ((page = pagetable_iterate(tbm->simple_pagetable, &i)) != NULL) { + if (page->ischunk) { + continue; /* already a chunk header */ + } + /* + * If the page would become a chunk header, we won't save anything by + * converting it to lossy, so skip it. + */ + if ((page->entryNode.blockNo % tbm->pages_per_chunk) == 0) { + continue; + } + + /* This does the dirty work ... */ + tbm_mark_page_lossy(tbm, page->entryNode); + + if (tbm->nentries <= tbm->maxentries / 2) { + /* + * we have made enough room. Remember where to start lossifying + * next round, so we evenly iterate over the hashtable. + */ + tbm->lossify_start = i.cur; + break; + } + + /* + * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the + * hashtable and may have deleted the non-lossy chunk. We can + * continue the same hash table scan, since failure to visit one + * element or visiting the newly inserted element,isn't fatal. + */ + } +} + + /* * qsort comparator to handle PagetableEntry pointers. 
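The lossify_start bookkeeping in tbm_lossify_simple_iterate above deserves a short note: always restarting the scan at slot 0 would keep lossifying the same leading region of the table on every round, whereas remembering the iterator position spreads the lossification evenly across the whole table. A condensed sketch of the resume logic (the per-page lossify step is elided):

pagetable_iterator it;
PagetableEntry    *page;

pagetable_start_iterate_at(tbm->simple_pagetable, &it, tbm->lossify_start);
while ((page = pagetable_iterate(tbm->simple_pagetable, &it)) != NULL) {
    /* ... skip chunk headers and would-be headers, lossify the rest ... */
    if (tbm->nentries <= tbm->maxentries / 2) {
        tbm->lossify_start = it.cur;   /* next round resumes here, not at slot 0 */
        break;
    }
}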
*/ -static int tbm_comparator(const void* left, const void* right) +TBM_TEMPLATE static int tbm_comparator(const void* left, const void* right) { PagetableEntryNode l = (*((PagetableEntry* const*)left))->entryNode; PagetableEntryNode r = (*((PagetableEntry* const*)right))->entryNode; - if (l.partitionOid < r.partitionOid) { - return -1; - } else if (l.partitionOid > r.partitionOid) { - return 1; - } else if (l.bucketid < r.bucketid) { - return -1; - } else if (l.bucketid > r.bucketid) { - return 1; - } else if (l.blockNo < r.blockNo) { - return -1; - } else if (l.blockNo > r.blockNo) { - return 1; + if (type == TBM_SIMPLE_HASH) { + if (l.blockNo < r.blockNo) { + return -1; + } else if (l.blockNo > r.blockNo) { + return 1; + } + } else { + if (l.partitionOid < r.partitionOid) { + return -1; + } else if (l.partitionOid > r.partitionOid) { + return 1; + } else if (l.bucketid < r.bucketid) { + return -1; + } else if (l.bucketid > r.bucketid) { + return 1; + } else if (l.blockNo < r.blockNo) { + return -1; + } else if (l.blockNo > r.blockNo) { + return 1; + } } return 0; } +/* + * check if the tid bitmap for global index. + */ bool tbm_is_global(const TIDBitmap* tbm) { - return tbm->isGlobalPart; + return tbm->is_global_part; } -void tbm_set_global(TIDBitmap* tbm, bool isGlobal) +/* + * set tid bitmap is for global index. + */ +void tbm_set_global(TIDBitmap* tbm, bool val) { - tbm->isGlobalPart = isGlobal; + tbm->is_global_part = val; } +/* + * check if the tid bitmap for crossbucket index. + */ bool tbm_is_crossbucket(const TIDBitmap* tbm) { - return tbm->crossbucket; -} - -void tbm_set_crossbucket(TIDBitmap* tbm, bool crossbucket) -{ - tbm->crossbucket = crossbucket; + return tbm->is_crossbucket; } diff --git a/src/common/backend/utils/mmgr/mcxt.cpp b/src/common/backend/utils/mmgr/mcxt.cpp index f4b722e12..b2e8a4985 100644 --- a/src/common/backend/utils/mmgr/mcxt.cpp +++ b/src/common/backend/utils/mmgr/mcxt.cpp @@ -1137,6 +1137,67 @@ void* MemoryContextAllocZeroAlignedDebug(MemoryContext context, Size size, const return ret; } + +/* + * MemoryContextAllocExtended + * Allocate space within the specified context using the given flags. + * + * This method supports all three memory allocation flags which makes it + * suitable for almost all circumstances. + */ +void* MemoryContextAllocExtendedDebug(MemoryContext context, Size size, int flags, const char* file, int line) +{ + void* ret = NULL; + bool allocsz_is_valid = false; + + Assert(MemoryContextIsValid(context)); +#ifdef MEMORY_CONTEXT_CHECKING + PreventActionOnSealedContext(context); +#endif + + /* Make sure memory allocation size is valid. 
*/ + if ((flags & MCXT_ALLOC_HUGE) != 0) { + allocsz_is_valid = AllocHugeSizeIsValid(size); + } else { + allocsz_is_valid = AllocSizeIsValid(size); + } + + if (!allocsz_is_valid) { + ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid memory alloc request size %lu in %s:%d", (unsigned long)size, file, line))); + } + + context->isReset = false; + + /* Invoke memory allocator */ + ret = (*context->methods->alloc)(context, 0, size, file, line); + if ((flags & MCXT_ALLOC_NO_OOM) != 0) { + /* Do nothing */ + } else if (unlikely(ret == NULL)) { + ereport(ERROR, (errcode(ERRCODE_OUT_OF_LOGICAL_MEMORY), errmsg("memory is temporarily unavailable"), + errdetail("Failed on request of size %lu bytes under queryid %lu in %s:%d.", + (unsigned long)size, u_sess->debug_query_id, file, line))); + } + + /* Set aligned if MCXT_ALLOC_ZERO */ + if ((flags & MCXT_ALLOC_ZERO) != 0) { + MemSetAligned(ret, 0, size); + } + +#ifdef MEMORY_CONTEXT_CHECKING + /* check if the memory context is out of control */ + MemoryContextCheckMaxSize(context, size, file, line); +#endif + + /* check if the session used memory is beyond the limitation */ + if (unlikely(STATEMENT_MAX_MEM)) { + MemoryContextCheckSessionMemory(context, size, file, line); + } + InsertMemoryAllocInfo(ret, context, file, line, size); + + return ret; +} + /* * palloc_extended * palloc with flags, it will return NULL while OOM happend. diff --git a/src/gausskernel/optimizer/path/costsize.cpp b/src/gausskernel/optimizer/path/costsize.cpp index 6dfc60d29..4b90d7fca 100755 --- a/src/gausskernel/optimizer/path/costsize.cpp +++ b/src/gausskernel/optimizer/path/costsize.cpp @@ -1424,7 +1424,7 @@ bool has_lossy_pages(RelOptInfo *baserel, const double &pages_fetched, double &l */ double heap_pages = Min(pages_fetched, baserel->pages); const long work_mem_size = u_sess->attr.attr_memory.work_mem * 1024L; - long maxentries = tbm_calculate_entries(work_mem_size); + long maxentries = tbm_calculate_entries(work_mem_size, false); if (maxentries >= heap_pages) { return false; } diff --git a/src/gausskernel/runtime/executor/nodeBitmapAnd.cpp b/src/gausskernel/runtime/executor/nodeBitmapAnd.cpp index 6a72dbe42..d51aa96fe 100644 --- a/src/gausskernel/runtime/executor/nodeBitmapAnd.cpp +++ b/src/gausskernel/runtime/executor/nodeBitmapAnd.cpp @@ -112,8 +112,10 @@ Node* MultiExecBitmapAnd(BitmapAndState* node) */ for (i = 0; i < nplans; i++) { PlanState* subnode = bitmapplans[i]; - subnode->hbktScanSlot.currSlot = node->ps.hbktScanSlot.currSlot; TIDBitmap* subresult = NULL; + TBMHandler tbm_handler; + + subnode->hbktScanSlot.currSlot = node->ps.hbktScanSlot.currSlot; subresult = (TIDBitmap*)MultiExecProcNode(subnode); if (subresult == NULL || !IsA(subresult, TIDBitmap)) @@ -125,6 +127,8 @@ Node* MultiExecBitmapAnd(BitmapAndState* node) if (result == NULL) { result = subresult; /* first subplan */ } else { + /* get tbm handlers */ + tbm_handler = tbm_get_handler(result); /* * If the global tbm intersect with non-global tbm, * set the final result to non-global tbm. 
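Returning to the MemoryContextAllocExtendedDebug routine added above: it is presumably the entry point the simplehash allocator relies on, since that allocator wants zeroed (and potentially huge) chunks. A hedged usage sketch; the non-Debug wrapper name and the exact flag spellings are assumed from the flags tested in the function body:

Size request = 8192;            /* illustrative size */
void *buf = MemoryContextAllocExtended(CurrentMemoryContext, request,
                                       MCXT_ALLOC_ZERO | MCXT_ALLOC_NO_OOM);
if (buf == NULL) {
    /* soft OOM: caller can degrade gracefully instead of raising ERROR */
}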
@@ -137,7 +141,7 @@ Node* MultiExecBitmapAnd(BitmapAndState* node) tbm_set_global(result, false); } - tbm_intersect(result, subresult); + tbm_handler._intersect(result, subresult); tbm_free(subresult); } diff --git a/src/gausskernel/runtime/executor/nodeBitmapHeapscan.cpp b/src/gausskernel/runtime/executor/nodeBitmapHeapscan.cpp index cf028dbd2..416e29645 100644 --- a/src/gausskernel/runtime/executor/nodeBitmapHeapscan.cpp +++ b/src/gausskernel/runtime/executor/nodeBitmapHeapscan.cpp @@ -258,6 +258,7 @@ static TupleTableSlot* BitmapHeapTblNext(BitmapHeapScanState* node) ExprContext* econtext = NULL; TableScanDesc scan = NULL; TIDBitmap* tbm = NULL; + TBMHandler tbm_handler; TBMIterator* tbmiterator = NULL; TBMIterateResult* tbmres = NULL; HBktTblScanDesc hpscan = NULL; @@ -301,6 +302,7 @@ static TupleTableSlot* BitmapHeapTblNext(BitmapHeapScanState* node) */ if (tbm == NULL) { tbm = (TIDBitmap*)MultiExecProcNode(outerPlanState(node)); + tbm_handler = tbm_get_handler(tbm); if (tbm == NULL || !IsA(tbm, TIDBitmap)) { ereport(ERROR, @@ -310,12 +312,12 @@ static TupleTableSlot* BitmapHeapTblNext(BitmapHeapScanState* node) } node->tbm = tbm; - node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm); + node->tbmiterator = tbmiterator = tbm_handler._begin_iterate(tbm); node->tbmres = tbmres = NULL; #ifdef USE_PREFETCH if (u_sess->storage_cxt.target_prefetch_pages > 0) { - node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm); + node->prefetch_iterator = prefetch_iterator = tbm_handler._begin_iterate(tbm); node->prefetch_pages = 0; node->prefetch_target = -1; } diff --git a/src/gausskernel/runtime/executor/nodeBitmapIndexscan.cpp b/src/gausskernel/runtime/executor/nodeBitmapIndexscan.cpp index 4136698fc..19073fe94 100644 --- a/src/gausskernel/runtime/executor/nodeBitmapIndexscan.cpp +++ b/src/gausskernel/runtime/executor/nodeBitmapIndexscan.cpp @@ -34,21 +34,6 @@ #include "nodes/makefuncs.h" static void ExecInitNextPartitionForBitmapIndexScan(BitmapIndexScanState* node); -/* If bitmapscan uses global partition index, set tbm to global */ -static inline void GPIUpdateTbmType(BitmapIndexScanState* node, TIDBitmap* tbm) -{ - if (RelationIsGlobalIndex(node->biss_RelationDesc)) { - tbm_set_global(tbm, true); - } -} - -/* if bitmapscan uses crossbucket index, set tbm->crossbucket to true */ -static inline void CBIUpdateTbmType(BitmapIndexScanState* node, TIDBitmap* tbm) -{ - if (RelationIsCrossBucketIndex(node->biss_RelationDesc)) { - tbm_set_crossbucket(tbm, true); - } -} /* ---------------------------------------------------------------- * MultiExecBitmapIndexScan(node) @@ -98,13 +83,9 @@ Node* MultiExecBitmapIndexScan(BitmapIndexScanState* node) node->biss_result = NULL; /* reset for next time */ } else { /* XXX should we use less than u_sess->attr.attr_memory.work_mem for this? */ - tbm = TbmCreate(u_sess->attr.attr_memory.work_mem * 1024L, isUstore); - - /* If bitmapscan uses global partition index, set tbm to global. */ - GPIUpdateTbmType(node, tbm); - - /* If bitmapscan uses crossbucket index, set tbm->crossbucket to true. */ - CBIUpdateTbmType(node, tbm); + long maxbytes = u_sess->attr.attr_memory.work_mem * 1024L; + tbm = tbm_create(maxbytes, RelationIsGlobalIndex(node->biss_RelationDesc), + RelationIsCrossBucketIndex(node->biss_RelationDesc), isUstore); } /* Cross-bucket index scan should not switch the index bucket. 
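One detail of the nodeBitmapHeapscan.cpp hunk above: _begin_iterate is invoked twice on the same bitmap. Because iteration state lives in the TBMIterator rather than in the TIDBitmap, several iterators can walk one bitmap independently, which is what lets the prefetch iterator run ahead of the main one. A simplified sketch:

TBMHandler   h         = tbm_get_handler(tbm);
TBMIterator *main_iter = h._begin_iterate(tbm);   /* drives the actual heap fetches      */
TBMIterator *pref_iter = h._begin_iterate(tbm);   /* kept a few pages ahead for prefetch */

TBMIterateResult *r;
while ((r = tbm_iterate(main_iter)) != NULL) {
    /* advance pref_iter up to prefetch_target pages ahead and issue
     * PrefetchBuffer() for those blocks, then fetch r->blockno itself */
}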
*/ diff --git a/src/gausskernel/runtime/executor/nodeBitmapOr.cpp b/src/gausskernel/runtime/executor/nodeBitmapOr.cpp index 4145d629e..4dea450e4 100644 --- a/src/gausskernel/runtime/executor/nodeBitmapOr.cpp +++ b/src/gausskernel/runtime/executor/nodeBitmapOr.cpp @@ -126,15 +126,11 @@ Node* MultiExecBitmapOr(BitmapOrState* node) /* first subplan */ if (result == NULL) { /* XXX should we use less than u_sess->attr.attr_memory.work_mem for this? */ - result = TbmCreate(u_sess->attr.attr_memory.work_mem * 1024L, isUstore); - /* If bitmapscan uses global partition index, set tbm to global. */ - if (RelationIsGlobalIndex(((BitmapIndexScanState*)subnode)->biss_RelationDesc)) { - tbm_set_global(result, true); - } - /* If bitmapscan uses crossbucket index, set tbm to crossbucket. */ - if (RelationIsCrossBucketIndex(((BitmapIndexScanState*)subnode)->biss_RelationDesc)) { - tbm_set_crossbucket(result, true); - } + long maxbytes = u_sess->attr.attr_memory.work_mem * 1024L; + result = tbm_create(maxbytes, + RelationIsGlobalIndex(((BitmapIndexScanState *)subnode)->biss_RelationDesc), + RelationIsCrossBucketIndex(((BitmapIndexScanState *)subnode)->biss_RelationDesc), + isUstore); } ((BitmapIndexScanState*)subnode)->biss_result = result; @@ -157,13 +153,14 @@ Node* MultiExecBitmapOr(BitmapOrState* node) if (result == NULL) { result = subresult; /* first subplan */ } else { + TBMHandler tbm_handler = tbm_get_handler(result); if (tbm_is_global(result) != tbm_is_global(subresult)) { ereport(ERROR, (errcode(ERRCODE_UNRECOGNIZED_NODE_TYPE), errmsg( "do not support bitmap index scan for global index and local index simultaneously."))); } - tbm_union(result, subresult); + tbm_handler._union(result, subresult); tbm_free(subresult); } } diff --git a/src/gausskernel/storage/access/gin/gindatapage.cpp b/src/gausskernel/storage/access/gin/gindatapage.cpp index 926a561ed..842999f73 100644 --- a/src/gausskernel/storage/access/gin/gindatapage.cpp +++ b/src/gausskernel/storage/access/gin/gindatapage.cpp @@ -178,10 +178,11 @@ int GinDataLeafPageGetItemsToTbm(Page page, TIDBitmap *tbm) nitems = ginPostingListDecodeAllSegmentsToTbm(segment, len, tbm); } else { + TBMHandler tbm_handler = tbm_get_handler(tbm); uncompressed = dataLeafPageGetUncompressed(page, &nitems); if (nitems > 0) - tbm_add_tuples(tbm, uncompressed, nitems, false); + tbm_handler._add_tuples(tbm, uncompressed, nitems, false, InvalidOid, InvalidBktId); } return nitems; diff --git a/src/gausskernel/storage/access/gin/ginget.cpp b/src/gausskernel/storage/access/gin/ginget.cpp index d78776941..20d27ae91 100644 --- a/src/gausskernel/storage/access/gin/ginget.cpp +++ b/src/gausskernel/storage/access/gin/ginget.cpp @@ -132,10 +132,12 @@ static bool collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack, GinSca { OffsetNumber attnum; Form_pg_attribute attr; + TBMHandler tbm_handler; /* Initialize empty bitmap result */ if (!isColStore) { - scanEntry->matchBitmap = TbmCreate(u_sess->attr.attr_memory.work_mem * 1024L); + scanEntry->matchBitmap = tbm_create(u_sess->attr.attr_memory.work_mem * 1024L); + tbm_handler = tbm_get_handler(scanEntry->matchBitmap); } /* Null query cannot partial-match anything */ @@ -280,7 +282,7 @@ static bool collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack, GinSca ipd = ginReadTuple(btree->ginstate, scanEntry->attnum, itup, &nipd); if (!isColStore) { - tbm_add_tuples(scanEntry->matchBitmap, ipd, nipd, false); + tbm_handler._add_tuples(scanEntry->matchBitmap, ipd, nipd, false, InvalidOid, InvalidBktId); } else { if 
(scanEntry->matchList == NULL) { scanEntry->matchList = (ItemPointer)palloc(nipd * sizeof(ItemPointerData)); @@ -374,7 +376,8 @@ restartScanEntry: } if (!isColStore && entry->matchBitmap && !tbm_is_empty(entry->matchBitmap)) { - entry->matchIterator = tbm_begin_iterate(entry->matchBitmap); + TBMHandler tbm_handler = tbm_get_handler(entry->matchBitmap); + entry->matchIterator = tbm_handler._begin_iterate(entry->matchBitmap); entry->isFinished = false; } @@ -1569,6 +1572,7 @@ static void scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) pendingPosition pos; Buffer metabuffer = ReadBuffer(scan->indexRelation, GIN_METAPAGE_BLKNO); BlockNumber blkno; + TBMHandler tbm_handler = tbm_get_handler(tbm); Oid partHeapOid = IndexScanGetPartHeapOid(scan); *ntids = 0; @@ -1628,7 +1632,7 @@ static void scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) MemoryContextReset(so->tempCtx); if (match) { - tbm_add_tuples(tbm, &pos.item, 1, recheck, partHeapOid); + tbm_handler._add_tuples(tbm, &pos.item, 1, recheck, partHeapOid, InvalidBktId); (*ntids)++; } } @@ -1652,6 +1656,7 @@ Datum gingetbitmap(PG_FUNCTION_ARGS) int64 ntids; ItemPointerData iptr; bool recheck = false; + TBMHandler tbm_handler = tbm_get_handler(tbm); Oid partHeapOid = IndexScanGetPartHeapOid(scan); /* @@ -1691,9 +1696,9 @@ Datum gingetbitmap(PG_FUNCTION_ARGS) break; if (ItemPointerIsLossyPage(&iptr)) - tbm_add_page(tbm, ItemPointerGetBlockNumber(&iptr), partHeapOid); + tbm_handler._add_page(tbm, ItemPointerGetBlockNumber(&iptr), partHeapOid, InvalidBktId); else - tbm_add_tuples(tbm, &iptr, 1, recheck, partHeapOid); + tbm_handler._add_tuples(tbm, &iptr, 1, recheck, partHeapOid, InvalidBktId); ntids++; } diff --git a/src/gausskernel/storage/access/gin/ginpostinglist.cpp b/src/gausskernel/storage/access/gin/ginpostinglist.cpp index c83c67930..85a23be62 100644 --- a/src/gausskernel/storage/access/gin/ginpostinglist.cpp +++ b/src/gausskernel/storage/access/gin/ginpostinglist.cpp @@ -361,9 +361,10 @@ int ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int len, TIDBitmap { int ndecoded; ItemPointer items; + TBMHandler tbm_handler = tbm_get_handler(tbm); items = ginPostingListDecodeAllSegments(ptr, len, &ndecoded); - tbm_add_tuples(tbm, items, ndecoded, false); + tbm_handler._add_tuples(tbm, items, ndecoded, false, InvalidOid, InvalidBktId); pfree(items); return ndecoded; } diff --git a/src/gausskernel/storage/access/gist/gistget.cpp b/src/gausskernel/storage/access/gist/gistget.cpp index 5b053894e..e837e45fb 100644 --- a/src/gausskernel/storage/access/gist/gistget.cpp +++ b/src/gausskernel/storage/access/gist/gistget.cpp @@ -288,11 +288,12 @@ static void gistScanPage(IndexScanDesc scan, const GISTSearchItem *pageItem, con continue; if (tbm && GistPageIsLeaf(page)) { + TBMHandler tbm_handler = tbm_get_handler(tbm); /* * getbitmap scan, so just push heap tuple TIDs into the bitmap * without worrying about ordering */ - tbm_add_tuples(tbm, &it->t_tid, 1, recheck, partHeapOid); + tbm_handler._add_tuples(tbm, &it->t_tid, 1, recheck, partHeapOid, InvalidBktId); (*ntids)++; } else if (scan->numberOfOrderBys == 0 && GistPageIsLeaf(page)) { /* diff --git a/src/gausskernel/storage/access/hash/hash.cpp b/src/gausskernel/storage/access/hash/hash.cpp index c124c9499..e7cfe74d7 100644 --- a/src/gausskernel/storage/access/hash/hash.cpp +++ b/src/gausskernel/storage/access/hash/hash.cpp @@ -357,8 +357,9 @@ Datum hashgetbitmap(PG_FUNCTION_ARGS) /* Save tuple ID, and continue scanning */ if (add_tuple) { + TBMHandler 
tbm_handler = tbm_get_handler(tbm); /* Note we mark the tuple ID as requiring recheck */ - tbm_add_tuples(tbm, &(so->hashso_heappos), 1, true, partHeapOid); + tbm_handler._add_tuples(tbm, &(so->hashso_heappos), 1, true, partHeapOid, InvalidBktId); ntids++; } diff --git a/src/gausskernel/storage/access/nbtree/nbtree.cpp b/src/gausskernel/storage/access/nbtree/nbtree.cpp index 5f409dd74..083d68b41 100644 --- a/src/gausskernel/storage/access/nbtree/nbtree.cpp +++ b/src/gausskernel/storage/access/nbtree/nbtree.cpp @@ -320,6 +320,9 @@ int64 btgetbitmap_internal(IndexScanDesc scan, TIDBitmap *tbm) BTScanOpaque so = (BTScanOpaque)scan->opaque; int64 ntids = 0; ItemPointer heapTid; + Oid currPartOid; + int2 bucketid; + TBMHandler tbm_handler = tbm_get_handler(tbm); /* * If we have any array keys, initialize them. @@ -339,9 +342,9 @@ int64 btgetbitmap_internal(IndexScanDesc scan, TIDBitmap *tbm) if (_bt_first(scan, ForwardScanDirection)) { /* Save tuple ID, and continue scanning */ heapTid = &scan->xs_ctup.t_self; - Oid currPartOid = so->currPos.items[so->currPos.itemIndex].partitionOid; - int2 bucketid = so->currPos.items[so->currPos.itemIndex].bucketid; - tbm_add_tuples(tbm, heapTid, 1, false, currPartOid, bucketid); + currPartOid = so->currPos.items[so->currPos.itemIndex].partitionOid; + bucketid = so->currPos.items[so->currPos.itemIndex].bucketid; + tbm_handler._add_tuples(tbm, heapTid, 1, false, currPartOid, bucketid); ntids++; for (;;) { @@ -360,7 +363,7 @@ int64 btgetbitmap_internal(IndexScanDesc scan, TIDBitmap *tbm) heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid; currPartOid = so->currPos.items[so->currPos.itemIndex].partitionOid; bucketid = so->currPos.items[so->currPos.itemIndex].bucketid; - tbm_add_tuples(tbm, heapTid, 1, false, currPartOid, bucketid); + tbm_handler._add_tuples(tbm, heapTid, 1, false, currPartOid, bucketid); ntids++; } } diff --git a/src/gausskernel/storage/access/spgist/spgscan.cpp b/src/gausskernel/storage/access/spgist/spgscan.cpp index 962201d3c..d58b2f2d0 100644 --- a/src/gausskernel/storage/access/spgist/spgscan.cpp +++ b/src/gausskernel/storage/access/spgist/spgscan.cpp @@ -489,8 +489,9 @@ static void spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex, st /* storeRes subroutine for getbitmap case */ static void storeBitmap(SpGistScanOpaque so, ItemPointer heapPtr, Datum leafValue, bool isnull, bool recheck) -{ - tbm_add_tuples(so->tbm, heapPtr, 1, recheck, so->partHeapOid); +{ + TBMHandler tbm_handler = tbm_get_handler(so->tbm); + tbm_handler._add_tuples(so->tbm, heapPtr, 1, recheck, so->partHeapOid, InvalidBktId); so->ntids++; } diff --git a/src/gausskernel/storage/access/ubtree/ubtree.cpp b/src/gausskernel/storage/access/ubtree/ubtree.cpp index af1e8f85d..196f01e4e 100644 --- a/src/gausskernel/storage/access/ubtree/ubtree.cpp +++ b/src/gausskernel/storage/access/ubtree/ubtree.cpp @@ -291,6 +291,8 @@ Datum ubtgetbitmap(PG_FUNCTION_ARGS) BTScanOpaque so = (BTScanOpaque)scan->opaque; int64 ntids = 0; ItemPointer heapTid; + Oid currPartOid; + TBMHandler tbm_handler = tbm_get_handler(tbm); WHITEBOX_TEST_STUB("ubtgetbitmap", WhiteboxDefaultErrorEmit); @@ -312,8 +314,8 @@ Datum ubtgetbitmap(PG_FUNCTION_ARGS) if (UBTreeFirst(scan, ForwardScanDirection)) { /* Save tuple ID, and continue scanning */ heapTid = &scan->xs_ctup.t_self; - Oid currPartOid = so->currPos.items[so->currPos.itemIndex].partitionOid; - tbm_add_tuples(tbm, heapTid, 1, scan->xs_recheck_itup, currPartOid); + currPartOid = so->currPos.items[so->currPos.itemIndex].partitionOid; 
+ tbm_handler._add_tuples(tbm, heapTid, 1, scan->xs_recheck_itup, currPartOid, InvalidBktId); ntids++; for (;;) { @@ -331,7 +333,7 @@ Datum ubtgetbitmap(PG_FUNCTION_ARGS) /* Save tuple ID, and continue scanning */ heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid; currPartOid = so->currPos.items[so->currPos.itemIndex].partitionOid; - tbm_add_tuples(tbm, heapTid, 1, scan->xs_recheck_itup, currPartOid); + tbm_handler._add_tuples(tbm, heapTid, 1, scan->xs_recheck_itup, currPartOid, InvalidBktId); ntids++; } } diff --git a/src/include/lib/simplehash.h b/src/include/lib/simplehash.h new file mode 100644 index 000000000..946d7e44d --- /dev/null +++ b/src/include/lib/simplehash.h @@ -0,0 +1,1141 @@ +/* + * simplehash.h + * + * When included this file generates a "templated" (by way of macros) + * open-addressing hash table implementation specialized to user-defined + * types. + * + * It's probably not worthwhile to generate such a specialized implementation + * for hash tables that aren't performance or space sensitive. + * + * Compared to dynahash, simplehash has the following benefits: + * + * - Due to the "templated" code generation has known structure sizes and no + * indirect function calls (which show up substantially in dynahash + * profiles). These features considerably increase speed for small + * entries. + * - Open addressing has better CPU cache behavior than dynahash's chained + * hashtables. + * - The generated interface is type-safe and easier to use than dynahash, + * though at the cost of more complex setup. + * - Allocates memory in a MemoryContext or another allocator with a + * malloc/free style interface (which isn't easily usable in a shared + * memory context) + * - Does not require the overhead of a separate memory context. + * + * Usage notes: + * + * To generate a hash-table and associated functions for a use case several + * macros have to be #define'ed before this file is included. Including + * the file #undef's all those, so a new hash table can be generated + * afterwards. + * The relevant parameters are: + * - SH_PREFIX - prefix for all symbol names generated. A prefix of 'foo' + * will result in hash table type 'foo_hash' and functions like + * 'foo_insert'/'foo_lookup' and so forth. + * - SH_ELEMENT_TYPE - type of the contained elements + * - SH_KEY_TYPE - type of the hashtable's key + * - SH_DECLARE - if defined function prototypes and type declarations are + * generated + * - SH_DEFINE - if defined function definitions are generated + * - SH_SCOPE - in which scope (e.g. extern, static inline) do function + * declarations reside + * - SH_RAW_ALLOCATOR - if defined, memory contexts are not used; instead, + * use this to allocate bytes. The allocator must zero the returned space. + * - SH_USE_NONDEFAULT_ALLOCATOR - if defined no element allocator functions + * are defined, so you can supply your own + * The following parameters are only relevant when SH_DEFINE is defined: + * - SH_KEY - name of the element in SH_ELEMENT_TYPE containing the hash key + * - SH_EQUAL(table, a, b) - compare two table keys + * - SH_HASH_KEY(table, key) - generate hash for the key + * - SH_STORE_HASH - if defined the hash is stored in the elements + * - SH_GET_HASH(tb, a) - return the field to store the hash in + * + * The element type is required to contain a "status" member that can store + * the range of values defined in the SH_STATUS enum. 
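As a concrete illustration of the macro protocol described above, a minimal file-local instantiation might look like the sketch below. The names int4hash and Int4HashEntry, the int64 counter payload, and the choice of murmurhash32() as the hash function are assumptions made for this example; they are not part of the patch.

    #include "utils/hashutils.h"            /* murmurhash32() */

    typedef struct Int4HashEntry
    {
        int32   key;                        /* SH_KEY */
        int64   count;                      /* caller payload */
        char    status;                     /* required: holds SH_STATUS values */
    } Int4HashEntry;

    #define SH_PREFIX int4hash
    #define SH_ELEMENT_TYPE Int4HashEntry
    #define SH_KEY_TYPE int32
    #define SH_KEY key
    #define SH_HASH_KEY(tb, k) murmurhash32((uint32) (k))
    #define SH_EQUAL(tb, a, b) ((a) == (b))
    #define SH_SCOPE static inline
    #define SH_DECLARE
    #define SH_DEFINE
    #include "lib/simplehash.h"

Including the header with SH_DECLARE and SH_DEFINE set generates int4hash_create(), int4hash_insert(), int4hash_lookup(), int4hash_delete() and the other functions listed below, all specialized to Int4HashEntry.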
+ * + * While SH_STORE_HASH (and subsequently SH_GET_HASH) are optional, because + * the hash table implementation needs to compare hashes to move elements + * (particularly when growing the hash), it's preferable, if possible, to + * store the element's hash in the element's data type. If the hash is so + * stored, the hash table will also compare hashes before calling SH_EQUAL + * when comparing two keys. + * + * For convenience the hash table create functions accept a void pointer + * that will be stored in the hash table type's member private_data. This + * allows callbacks to reference caller provided data. + * + * For examples of usage look at tidbitmap.c (file local definition) and + * execnodes.h/execGrouping.c (exposed declaration, file local + * implementation). + * + * Hash table design: + * + * The hash table design chosen is a variant of linear open-addressing. The + * reason for doing so is that linear addressing is CPU cache & pipeline + * friendly. The biggest disadvantages of simple linear addressing schemes + * are highly variable lookup times due to clustering, and deletions + * leaving a lot of tombstones around. To address these issues a variant + * of "robin hood" hashing is employed. Robin hood hashing optimizes + * chaining lengths by moving elements close to their optimal bucket + * ("rich" elements), out of the way if a to-be-inserted element is further + * away from its optimal position (i.e. it's "poor"). While that can make + * insertions slower, the average lookup performance is a lot better, and + * higher fill factors can be used in a still performant manner. To avoid + * tombstones - which normally solve the issue that a deleted node's + * presence is relevant to determine whether a lookup needs to continue + * looking or is done - buckets following a deleted element are shifted + * backwards, unless they're empty or already at their optimal position.
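The displacement rule described above can be pictured with a small standalone sketch; this only illustrates the decision made on a collision and is not the generated code itself:

    /*
     * The incoming key steals the resident's bucket when the incoming key is
     * already farther from its optimal bucket than the resident is from its
     * own; otherwise the probe continues at the next bucket.
     */
    static inline bool
    incoming_displaces_resident(uint32 insert_dist,      /* distance probed so far by the new key */
                                uint32 resident_optimal, /* resident entry's optimal bucket */
                                uint32 resident_bucket,  /* bucket the resident actually occupies */
                                uint32 table_size)       /* power-of-2 number of buckets */
    {
        uint32 resident_dist = (resident_optimal <= resident_bucket)
                                   ? resident_bucket - resident_optimal
                                   : table_size + resident_bucket - resident_optimal;

        return insert_dist > resident_dist;
    }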
+ * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/lib/simplehash.h + */ + +#include "port/pg_bitutils.h" + +/* helpers */ +#define SH_MAKE_PREFIX(a) CppConcat(a, _) +#define SH_MAKE_NAME(name) SH_MAKE_NAME_(SH_MAKE_PREFIX(SH_PREFIX), name) +#define SH_MAKE_NAME_(a, b) CppConcat(a, b) + +/* name macros for: */ + +/* type declarations */ +#define SH_TYPE SH_MAKE_NAME(hash) +#define SH_STATUS SH_MAKE_NAME(status) +#define SH_STATUS_EMPTY SH_MAKE_NAME(SH_EMPTY) +#define SH_STATUS_IN_USE SH_MAKE_NAME(SH_IN_USE) +#define SH_ITERATOR SH_MAKE_NAME(iterator) + +/* function declarations */ +#define SH_CREATE SH_MAKE_NAME(create) +#define SH_DESTROY SH_MAKE_NAME(destroy) +#define SH_RESET SH_MAKE_NAME(reset) +#define SH_INSERT SH_MAKE_NAME(insert) +#define SH_INSERT_HASH SH_MAKE_NAME(insert_hash) +#define SH_DELETE_ITEM SH_MAKE_NAME(delete_item) +#define SH_DELETE SH_MAKE_NAME(delete) +#define SH_LOOKUP SH_MAKE_NAME(lookup) +#define SH_LOOKUP_HASH SH_MAKE_NAME(lookup_hash) +#define SH_GROW SH_MAKE_NAME(grow) +#define SH_START_ITERATE SH_MAKE_NAME(start_iterate) +#define SH_START_ITERATE_AT SH_MAKE_NAME(start_iterate_at) +#define SH_ITERATE SH_MAKE_NAME(iterate) +#define SH_ALLOCATE SH_MAKE_NAME(allocate) +#define SH_FREE SH_MAKE_NAME(free) +#define SH_STAT SH_MAKE_NAME(stat) + +/* internal helper functions (no externally visible prototypes) */ +#define SH_COMPUTE_PARAMETERS SH_MAKE_NAME(compute_parameters) +#define SH_NEXT SH_MAKE_NAME(next) +#define SH_PREV SH_MAKE_NAME(prev) +#define SH_DISTANCE_FROM_OPTIMAL SH_MAKE_NAME(distance) +#define SH_INITIAL_BUCKET SH_MAKE_NAME(initial_bucket) +#define SH_ENTRY_HASH SH_MAKE_NAME(entry_hash) +#define SH_INSERT_HASH_INTERNAL SH_MAKE_NAME(insert_hash_internal) +#define SH_LOOKUP_HASH_INTERNAL SH_MAKE_NAME(lookup_hash_internal) + +/* generate forward declarations necessary to use the hash table */ +#ifdef SH_DECLARE + +/* type definitions */ +typedef struct SH_TYPE { + /* + * Size of data / bucket array, 64 bits to handle UINT32_MAX sized hash + * tables. Note that the maximum number of elements is lower + * (SH_MAX_FILLFACTOR) + */ + uint64 size; + + /* how many elements have valid contents */ + uint32 members; + + /* mask for bucket and size calculations, based on size */ + uint32 sizemask; + + /* boundary after which to grow hashtable */ + uint32 grow_threshold; + + /* hash buckets */ + SH_ELEMENT_TYPE *data; + +#ifndef SH_RAW_ALLOCATOR + /* memory context to use for allocations */ + MemoryContext ctx; +#endif + + /* user defined data, useful for callbacks */ + void *private_data; +} SH_TYPE; + +typedef enum SH_STATUS { SH_STATUS_EMPTY = 0x00, SH_STATUS_IN_USE = 0x01 } SH_STATUS; + +typedef struct SH_ITERATOR { + uint32 cur; /* current element */ + uint32 end; + bool done; /* iterator exhausted? 
*/ +} SH_ITERATOR; + +/* externally visible function prototypes */ +#ifdef SH_RAW_ALLOCATOR +/* _hash _create(uint32 nelements, void *private_data) */ +SH_SCOPE SH_TYPE *SH_CREATE(uint32 nelements, void *private_data); +#else +/* + * _hash _create(MemoryContext ctx, uint32 nelements, + * void *private_data) + */ +SH_SCOPE SH_TYPE *SH_CREATE(MemoryContext ctx, uint32 nelements, void *private_data); +#endif + +/* void _destroy(_hash *tb) */ +SH_SCOPE void SH_DESTROY(SH_TYPE *tb); + +/* void _reset(_hash *tb) */ +SH_SCOPE void SH_RESET(SH_TYPE *tb); + +/* void _grow(_hash *tb, uint64 newsize) */ +SH_SCOPE void SH_GROW(SH_TYPE *tb, uint64 newsize); + +/* *_insert(_hash *tb, key, bool *found) */ +SH_SCOPE SH_ELEMENT_TYPE *SH_INSERT(SH_TYPE *tb, SH_KEY_TYPE key, bool *found); + +/* + * *_insert_hash(_hash *tb, key, uint32 hash, + * bool *found) + */ +SH_SCOPE SH_ELEMENT_TYPE *SH_INSERT_HASH(SH_TYPE *tb, SH_KEY_TYPE key, uint32 hash, bool *found); + +/* *_lookup(_hash *tb, key) */ +SH_SCOPE SH_ELEMENT_TYPE *SH_LOOKUP(SH_TYPE *tb, SH_KEY_TYPE key); + +/* *_lookup_hash(_hash *tb, key, uint32 hash) */ +SH_SCOPE SH_ELEMENT_TYPE *SH_LOOKUP_HASH(SH_TYPE *tb, SH_KEY_TYPE key, uint32 hash); + +/* void _delete_item(_hash *tb, *entry) */ +SH_SCOPE void SH_DELETE_ITEM(SH_TYPE *tb, SH_ELEMENT_TYPE *entry); + +/* bool _delete(_hash *tb, key) */ +SH_SCOPE bool SH_DELETE(SH_TYPE *tb, SH_KEY_TYPE key); + +/* void _start_iterate(_hash *tb, _iterator *iter) */ +SH_SCOPE void SH_START_ITERATE(SH_TYPE *tb, SH_ITERATOR *iter); + +/* + * void _start_iterate_at(_hash *tb, _iterator *iter, + * uint32 at) + */ +SH_SCOPE void SH_START_ITERATE_AT(SH_TYPE *tb, SH_ITERATOR *iter, uint32 at); + +/* *_iterate(_hash *tb, _iterator *iter) */ +SH_SCOPE SH_ELEMENT_TYPE *SH_ITERATE(SH_TYPE *tb, SH_ITERATOR *iter); + +/* void _stat(_hash *tb */ +SH_SCOPE void SH_STAT(SH_TYPE *tb); + +#endif /* SH_DECLARE */ + +/* generate implementation of the hash table */ +#ifdef SH_DEFINE + +#ifndef SH_RAW_ALLOCATOR +#include "utils/memutils.h" +#endif + +/* max data array size,we allow up to PG_UINT32_MAX buckets, including 0 */ +#define SH_MAX_SIZE (((uint64)PG_UINT32_MAX) + 1) + +/* normal fillfactor, unless already close to maximum */ +#ifndef SH_FILLFACTOR +#define SH_FILLFACTOR (0.9) +#endif +/* increase fillfactor if we otherwise would error out */ +#define SH_MAX_FILLFACTOR (0.98) +/* grow if actual and optimal location bigger than */ +#ifndef SH_GROW_MAX_DIB +#define SH_GROW_MAX_DIB 25 +#endif +/* grow if more than elements to move when inserting */ +#ifndef SH_GROW_MAX_MOVE +#define SH_GROW_MAX_MOVE 150 +#endif +#ifndef SH_GROW_MIN_FILLFACTOR +/* but do not grow due to SH_GROW_MAX_* if below */ +#define SH_GROW_MIN_FILLFACTOR 0.1 +#endif + +#ifdef SH_STORE_HASH +#define SH_COMPARE_KEYS(tb, ahash, akey, b) (ahash == SH_GET_HASH(tb, b) && SH_EQUAL(tb, b->SH_KEY, akey)) +#else +#define SH_COMPARE_KEYS(tb, ahash, akey, b) (SH_EQUAL(tb, b->SH_KEY, akey)) +#endif + +/* + * Wrap the following definitions in include guards, to avoid multiple + * definition errors if this header is included more than once. The rest of + * the file deliberately has no include guards, because it can be included + * with different parameters to define functions and types with non-colliding + * names. + */ +#ifndef SIMPLEHASH_H +#define SIMPLEHASH_H + +#ifdef FRONTEND +#define sh_error(...) \ + do { \ + pg_log_fatal(__VA_ARGS__); \ + exit(1); \ + } while (0) +#define sh_log(...) pg_log_info(__VA_ARGS__) +#else +#define sh_error(...) 
elog(ERROR, __VA_ARGS__) +#define sh_log(...) elog(LOG, __VA_ARGS__) +#endif + +#endif + +/* calculate ceil(log base 2) of num */ +static inline uint64 sh_log2(uint64 num) +{ + int i; + uint64 limit; + + for (i = 0, limit = 1; limit < num; i++, limit <<= 1) + ; + return i; +} + +/* calculate first power of 2 >= num */ +static inline uint64 sh_pow2(uint64 num) +{ + return ((uint64)1) << sh_log2(num); +} + +/* + * Compute sizing parameters for hashtable. Called when creating and growing + * the hashtable. + */ +static inline void SH_COMPUTE_PARAMETERS(SH_TYPE *tb, uint64 newsize) +{ + uint64 size; + + /* supporting zero sized hashes would complicate matters */ + size = Max(newsize, 2); + + /* round up size to the next power of 2, that's how bucketing works */ + size = sh_pow2(size); + Assert(size <= SH_MAX_SIZE); + + /* + * Verify that allocation of ->data is possible on this platform, without + * overflowing Size. + */ + if (unlikely((((uint64)sizeof(SH_ELEMENT_TYPE)) * size) >= SIZE_MAX / 2)) + sh_error("hash table too large"); + + /* now set size */ + tb->size = size; + tb->sizemask = (uint32)(size - 1); + + /* + * Compute the next threshold at which we need to grow the hash table + * again. + */ + if (tb->size == SH_MAX_SIZE) + tb->grow_threshold = ((double)tb->size) * SH_MAX_FILLFACTOR; + else + tb->grow_threshold = ((double)tb->size) * SH_FILLFACTOR; +} + +/* return the optimal bucket for the hash */ +static inline uint32 SH_INITIAL_BUCKET(SH_TYPE *tb, uint32 hash) +{ + return hash & tb->sizemask; +} + +/* return next bucket after the current, handling wraparound */ +static inline uint32 SH_NEXT(SH_TYPE *tb, uint32 curelem, uint32 startelem) +{ + curelem = (curelem + 1) & tb->sizemask; + + Assert(curelem != startelem); + + return curelem; +} + +/* return bucket before the current, handling wraparound */ +static inline uint32 SH_PREV(SH_TYPE *tb, uint32 curelem, uint32 startelem) +{ + curelem = (curelem - 1) & tb->sizemask; + + Assert(curelem != startelem); + + return curelem; +} + +/* return distance between bucket and its optimal position */ +static inline uint32 SH_DISTANCE_FROM_OPTIMAL(SH_TYPE *tb, uint32 optimal, uint32 bucket) +{ + if (optimal <= bucket) + return bucket - optimal; + else + return (tb->size + bucket) - optimal; +} + +static inline uint32 SH_ENTRY_HASH(SH_TYPE *tb, SH_ELEMENT_TYPE *entry) +{ +#ifdef SH_STORE_HASH + return SH_GET_HASH(tb, entry); +#else + return SH_HASH_KEY(tb, entry->SH_KEY); +#endif +} + +/* default memory allocator function */ +static inline void *SH_ALLOCATE(SH_TYPE *type, Size size); +static inline void SH_FREE(SH_TYPE *type, void *pointer); + +#ifndef SH_USE_NONDEFAULT_ALLOCATOR + +/* default memory allocator function */ +static inline void *SH_ALLOCATE(SH_TYPE *type, Size size) +{ +#ifdef SH_RAW_ALLOCATOR + return SH_RAW_ALLOCATOR(size); +#else + return MemoryContextAllocExtended(type->ctx, size, MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO); +#endif +} + +/* default memory free function */ +static inline void SH_FREE(SH_TYPE *type, void *pointer) +{ + pfree_ext(pointer); +} + +#endif + +/* + * Create a hash table with enough space for `nelements` distinct members. + * Memory for the hash table is allocated from the passed-in context. If + * desired, the array of elements can be allocated using a passed-in allocator; + * this could be useful in order to place the array of elements in a shared + * memory, or in a context that will outlive the rest of the hash table. 
+ * Memory other than for the array of elements will still be allocated from + * the passed-in context. + */ +#ifdef SH_RAW_ALLOCATOR +SH_SCOPE SH_TYPE *SH_CREATE(uint32 nelements, void *private_data) +#else +SH_SCOPE SH_TYPE *SH_CREATE(MemoryContext ctx, uint32 nelements, void *private_data) +#endif +{ + SH_TYPE *tb; + uint64 size; + +#ifdef SH_RAW_ALLOCATOR + tb = (SH_TYPE *)SH_RAW_ALLOCATOR(sizeof(SH_TYPE)); +#else + tb = (SH_TYPE *)MemoryContextAllocZero(ctx, sizeof(SH_TYPE)); + tb->ctx = ctx; +#endif + tb->private_data = private_data; + + /* increase nelements by fillfactor, want to store nelements elements */ + size = Min((double)SH_MAX_SIZE, ((double)nelements) / SH_FILLFACTOR); + + SH_COMPUTE_PARAMETERS(tb, size); + + tb->data = (SH_ELEMENT_TYPE *)SH_ALLOCATE(tb, sizeof(SH_ELEMENT_TYPE) * tb->size); + + return tb; +} + +/* destroy a previously created hash table */ +SH_SCOPE void SH_DESTROY(SH_TYPE *tb) +{ + SH_FREE(tb, tb->data); + pfree(tb); +} + +/* reset the contents of a previously created hash table */ +SH_SCOPE void SH_RESET(SH_TYPE *tb) +{ + errno_t rc = EOK; + rc = memset_s(tb->data, sizeof(SH_ELEMENT_TYPE) * tb->size, 0, sizeof(SH_ELEMENT_TYPE) * tb->size); + securec_check(rc, "\0", "\0"); + tb->members = 0; +} + +/* + * Grow a hash table to at least `newsize` buckets. + * + * Usually this will automatically be called by insertions/deletions, when + * necessary. But resizing to the exact input size can be advantageous + * performance-wise, when known at some point. + */ +SH_SCOPE void SH_GROW(SH_TYPE *tb, uint64 newsize) +{ + uint64 oldsize = tb->size; + SH_ELEMENT_TYPE *olddata = tb->data; + SH_ELEMENT_TYPE *newdata; + uint32 i; + uint32 startelem = 0; + uint32 copyelem; + + Assert(oldsize == sh_pow2(oldsize)); + Assert(oldsize != SH_MAX_SIZE); + Assert(oldsize < newsize); + + /* compute parameters for new table */ + SH_COMPUTE_PARAMETERS(tb, newsize); + + tb->data = (SH_ELEMENT_TYPE *)SH_ALLOCATE(tb, sizeof(SH_ELEMENT_TYPE) * tb->size); + + newdata = tb->data; + + /* + * Copy entries from the old data to newdata. We theoretically could use + * SH_INSERT here, to avoid code duplication, but that's more general than + * we need. We neither want tb->members increased, nor do we need to do + * deal with deleted elements, nor do we need to compare keys. So a + * special-cased implementation is lot faster. As resizing can be time + * consuming and frequent, that's worthwhile to optimize. + * + * To be able to simply move entries over, we have to start not at the + * first bucket (i.e olddata[0]), but find the first bucket that's either + * empty, or is occupied by an entry at its optimal position. Such a + * bucket has to exist in any table with a load factor under 1, as not all + * buckets are occupied, i.e. there always has to be an empty bucket. By + * starting at such a bucket we can move the entries to the larger table, + * without having to deal with conflicts. 
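A quick worked example of the sizing arithmetic performed by SH_CREATE() and SH_COMPUTE_PARAMETERS(), using the default fill factors; the numbers are illustrative only:

    /* room for 1000 elements requested by the caller */
    uint64 size = sh_pow2((uint64) (1000 / SH_FILLFACTOR));    /* 1000 / 0.9 = 1111, rounded up to 2048 buckets */
    uint32 grow_threshold = (uint32) (size * SH_FILLFACTOR);   /* 2048 * 0.9 = 1843 members before the table doubles */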
+ */ + + /* search for the first element in the hash that's not wrapped around */ + for (i = 0; i < oldsize; i++) { + SH_ELEMENT_TYPE *oldentry = &olddata[i]; + uint32 hash; + uint32 optimal; + + if (oldentry->status != SH_STATUS_IN_USE) { + startelem = i; + break; + } + + hash = SH_ENTRY_HASH(tb, oldentry); + optimal = SH_INITIAL_BUCKET(tb, hash); + + if (optimal == i) { + startelem = i; + break; + } + } + + /* and copy all elements in the old table */ + copyelem = startelem; + for (i = 0; i < oldsize; i++) { + SH_ELEMENT_TYPE *oldentry = &olddata[copyelem]; + errno_t rc = EOK; + + if (oldentry->status == SH_STATUS_IN_USE) { + uint32 hash; + uint32 startelem; + uint32 curelem; + SH_ELEMENT_TYPE *newentry; + + hash = SH_ENTRY_HASH(tb, oldentry); + startelem = SH_INITIAL_BUCKET(tb, hash); + curelem = startelem; + + /* find empty element to put data into */ + while (true) { + newentry = &newdata[curelem]; + + if (newentry->status == SH_STATUS_EMPTY) { + break; + } + + curelem = SH_NEXT(tb, curelem, startelem); + } + + /* copy entry to new slot */ + rc = memcpy_s(newentry, sizeof(SH_ELEMENT_TYPE), oldentry, sizeof(SH_ELEMENT_TYPE)); + securec_check(rc, "\0", "\0"); + } + + /* can't use SH_NEXT here, would use new size */ + copyelem++; + if (copyelem >= oldsize) { + copyelem = 0; + } + } + + SH_FREE(tb, olddata); +} + +/* + * This is a separate static inline function, so it can be reliably be inlined + * into its wrapper functions even if SH_SCOPE is extern. + */ +static inline SH_ELEMENT_TYPE *SH_INSERT_HASH_INTERNAL(SH_TYPE *tb, SH_KEY_TYPE key, uint32 hash, bool *found) +{ + uint32 startelem; + uint32 curelem; + SH_ELEMENT_TYPE *data; + uint32 insertdist; + +restart: + insertdist = 0; + + /* + * We do the grow check even if the key is actually present, to avoid + * doing the check inside the loop. This also lets us avoid having to + * re-find our position in the hashtable after resizing. + * + * Note that this also reached when resizing the table due to + * SH_GROW_MAX_DIB / SH_GROW_MAX_MOVE. + */ + if (unlikely(tb->members >= tb->grow_threshold)) { + if (unlikely(tb->size == SH_MAX_SIZE)) + sh_error("hash table size exceeded"); + + /* + * When optimizing, it can be very useful to print these out. + */ + /* SH_STAT(tb); */ + SH_GROW(tb, tb->size * 2); + /* SH_STAT(tb); */ + } + + /* perform insert, start bucket search at optimal location */ + data = tb->data; + startelem = SH_INITIAL_BUCKET(tb, hash); + curelem = startelem; + while (true) { + uint32 curdist; + uint32 curhash; + uint32 curoptimal; + SH_ELEMENT_TYPE *entry = &data[curelem]; + + /* any empty bucket can directly be used */ + if (entry->status == SH_STATUS_EMPTY) { + tb->members++; + entry->SH_KEY = key; +#ifdef SH_STORE_HASH + SH_GET_HASH(tb, entry) = hash; +#endif + entry->status = SH_STATUS_IN_USE; + *found = false; + return entry; + } + + /* + * If the bucket is not empty, we either found a match (in which case + * we're done), or we have to decide whether to skip over or move the + * colliding entry. When the colliding element's distance to its + * optimal position is smaller than the to-be-inserted entry's, we + * shift the colliding entry (and its followers) forward by one. 
+ */ + + if (SH_COMPARE_KEYS(tb, hash, key, entry)) { + Assert(entry->status == SH_STATUS_IN_USE); + *found = true; + return entry; + } + + curhash = SH_ENTRY_HASH(tb, entry); + curoptimal = SH_INITIAL_BUCKET(tb, curhash); + curdist = SH_DISTANCE_FROM_OPTIMAL(tb, curoptimal, curelem); + + if (insertdist > curdist) { + SH_ELEMENT_TYPE *lastentry = entry; + uint32 emptyelem = curelem; + uint32 moveelem; + int32 emptydist = 0; + errno_t rc = EOK; + + /* find next empty bucket */ + while (true) { + SH_ELEMENT_TYPE *emptyentry; + + emptyelem = SH_NEXT(tb, emptyelem, startelem); + emptyentry = &data[emptyelem]; + + if (emptyentry->status == SH_STATUS_EMPTY) { + lastentry = emptyentry; + break; + } + + /* + * To avoid negative consequences from overly imbalanced + * hashtables, grow the hashtable if collisions would require + * us to move a lot of entries. The most likely cause of such + * imbalance is filling a (currently) small table, from a + * currently big one, in hash-table order. Don't grow if the + * hashtable would be too empty, to prevent quick space + * explosion for some weird edge cases. + */ + if (unlikely(++emptydist > SH_GROW_MAX_MOVE) && + ((double)tb->members / tb->size) >= SH_GROW_MIN_FILLFACTOR) { + tb->grow_threshold = 0; + goto restart; + } + } + + /* shift forward, starting at last occupied element */ + + /* + * TODO: This could be optimized to be one memcpy in many cases, + * excepting wrapping around at the end of ->data. Hasn't shown up + * in profiles so far though. + */ + moveelem = emptyelem; + while (moveelem != curelem) { + SH_ELEMENT_TYPE *moveentry; + + moveelem = SH_PREV(tb, moveelem, startelem); + moveentry = &data[moveelem]; + + + rc = memcpy_s(lastentry, sizeof(SH_ELEMENT_TYPE), moveentry, sizeof(SH_ELEMENT_TYPE)); + securec_check(rc, "\0", "\0"); + lastentry = moveentry; + } + + /* and fill the now empty spot */ + tb->members++; + + entry->SH_KEY = key; +#ifdef SH_STORE_HASH + SH_GET_HASH(tb, entry) = hash; +#endif + entry->status = SH_STATUS_IN_USE; + *found = false; + return entry; + } + + curelem = SH_NEXT(tb, curelem, startelem); + insertdist++; + + /* + * To avoid negative consequences from overly imbalanced hashtables, + * grow the hashtable if collisions lead to large runs. The most + * likely cause of such imbalance is filling a (currently) small + * table, from a currently big one, in hash-table order. Don't grow + * if the hashtable would be too empty, to prevent quick space + * explosion for some weird edge cases. + */ + if (unlikely(insertdist > SH_GROW_MAX_DIB) && ((double)tb->members / tb->size) >= SH_GROW_MIN_FILLFACTOR) { + tb->grow_threshold = 0; + goto restart; + } + } +} + +/* + * Insert the key key into the hash-table, set *found to true if the key + * already exists, false otherwise. Returns the hash-table entry in either + * case. + */ +SH_SCOPE SH_ELEMENT_TYPE *SH_INSERT(SH_TYPE *tb, SH_KEY_TYPE key, bool *found) +{ + uint32 hash = SH_HASH_KEY(tb, key); + + return SH_INSERT_HASH_INTERNAL(tb, key, hash, found); +} + +/* + * Insert the key key into the hash-table using an already-calculated + * hash. Set *found to true if the key already exists, false + * otherwise. Returns the hash-table entry in either case. + */ +SH_SCOPE SH_ELEMENT_TYPE *SH_INSERT_HASH(SH_TYPE *tb, SH_KEY_TYPE key, uint32 hash, bool *found) +{ + return SH_INSERT_HASH_INTERNAL(tb, key, hash, found); +} + +/* + * This is a separate static inline function, so it can be reliably be inlined + * into its wrapper functions even if SH_SCOPE is extern. 
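A hedged usage sketch of the generated insert API, reusing the hypothetical int4hash instantiation from the header comment above; the *found flag distinguishes initializing a brand-new entry from updating an existing one:

    int4hash_hash *tb = int4hash_create(CurrentMemoryContext, 128, NULL);
    bool found;
    Int4HashEntry *entry;

    entry = int4hash_insert(tb, 42, &found);
    if (!found)
        entry->count = 0;   /* new entry: only key, hash and status were initialized */
    entry->count++;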
+ */ +static inline SH_ELEMENT_TYPE *SH_LOOKUP_HASH_INTERNAL(SH_TYPE *tb, SH_KEY_TYPE key, uint32 hash) +{ + const uint32 startelem = SH_INITIAL_BUCKET(tb, hash); + uint32 curelem = startelem; + + while (true) { + SH_ELEMENT_TYPE *entry = &tb->data[curelem]; + + if (entry->status == SH_STATUS_EMPTY) { + return NULL; + } + + Assert(entry->status == SH_STATUS_IN_USE); + + if (SH_COMPARE_KEYS(tb, hash, key, entry)) + return entry; + + /* + * TODO: we could stop search based on distance. If the current + * buckets's distance-from-optimal is smaller than what we've skipped + * already, the entry doesn't exist. Probably only do so if + * SH_STORE_HASH is defined, to avoid re-computing hashes? + */ + + curelem = SH_NEXT(tb, curelem, startelem); + } +} + +/* + * Lookup up entry in hash table. Returns NULL if key not present. + */ +SH_SCOPE SH_ELEMENT_TYPE *SH_LOOKUP(SH_TYPE *tb, SH_KEY_TYPE key) +{ + uint32 hash = SH_HASH_KEY(tb, key); + + return SH_LOOKUP_HASH_INTERNAL(tb, key, hash); +} + +/* + * Lookup up entry in hash table using an already-calculated hash. + * + * Returns NULL if key not present. + */ +SH_SCOPE SH_ELEMENT_TYPE *SH_LOOKUP_HASH(SH_TYPE *tb, SH_KEY_TYPE key, uint32 hash) +{ + return SH_LOOKUP_HASH_INTERNAL(tb, key, hash); +} + +/* + * Delete entry from hash table by key. Returns whether to-be-deleted key was + * present. + */ +SH_SCOPE bool SH_DELETE(SH_TYPE *tb, SH_KEY_TYPE key) +{ + uint32 hash = SH_HASH_KEY(tb, key); + uint32 startelem = SH_INITIAL_BUCKET(tb, hash); + uint32 curelem = startelem; + + while (true) { + SH_ELEMENT_TYPE *entry = &tb->data[curelem]; + + if (entry->status == SH_STATUS_EMPTY) + return false; + + if (entry->status == SH_STATUS_IN_USE && SH_COMPARE_KEYS(tb, hash, key, entry)) { + SH_ELEMENT_TYPE *lastentry = entry; + errno_t rc = EOK; + + tb->members--; + + /* + * Backward shift following elements till either an empty element + * or an element at its optimal position is encountered. + * + * While that sounds expensive, the average chain length is short, + * and deletions would otherwise require tombstones. + */ + while (true) { + SH_ELEMENT_TYPE *curentry; + uint32 curhash; + uint32 curoptimal; + + curelem = SH_NEXT(tb, curelem, startelem); + curentry = &tb->data[curelem]; + + if (curentry->status != SH_STATUS_IN_USE) { + lastentry->status = SH_STATUS_EMPTY; + break; + } + + curhash = SH_ENTRY_HASH(tb, curentry); + curoptimal = SH_INITIAL_BUCKET(tb, curhash); + + /* current is at optimal position, done */ + if (curoptimal == curelem) { + lastentry->status = SH_STATUS_EMPTY; + break; + } + + /* shift */ + rc = memcpy_s(lastentry, sizeof(SH_ELEMENT_TYPE), curentry, sizeof(SH_ELEMENT_TYPE)); + securec_check(rc, "\0", "\0"); + + lastentry = curentry; + } + + return true; + } + + /* TODO: return false; if distance too big */ + + curelem = SH_NEXT(tb, curelem, startelem); + } +} + +/* + * Delete entry from hash table by entry pointer + */ +SH_SCOPE void SH_DELETE_ITEM(SH_TYPE *tb, SH_ELEMENT_TYPE *entry) +{ + SH_ELEMENT_TYPE *lastentry = entry; + uint32 hash = SH_ENTRY_HASH(tb, entry); + uint32 startelem = SH_INITIAL_BUCKET(tb, hash); + uint32 curelem; + errno_t rc = EOK; + + /* Calculate the index of 'entry' */ + curelem = entry - &tb->data[0]; + + tb->members--; + + /* + * Backward shift following elements till either an empty element or an + * element at its optimal position is encountered. + * + * While that sounds expensive, the average chain length is short, and + * deletions would otherwise require tombstones. 
+ */ + while (true) { + SH_ELEMENT_TYPE *curentry; + uint32 curhash; + uint32 curoptimal; + + curelem = SH_NEXT(tb, curelem, startelem); + curentry = &tb->data[curelem]; + + if (curentry->status != SH_STATUS_IN_USE) { + lastentry->status = SH_STATUS_EMPTY; + break; + } + + curhash = SH_ENTRY_HASH(tb, curentry); + curoptimal = SH_INITIAL_BUCKET(tb, curhash); + + /* current is at optimal position, done */ + if (curoptimal == curelem) { + lastentry->status = SH_STATUS_EMPTY; + break; + } + + /* shift */ + rc = memcpy_s(lastentry, sizeof(SH_ELEMENT_TYPE), curentry, sizeof(SH_ELEMENT_TYPE)); + securec_check(rc, "\0", "\0"); + + lastentry = curentry; + } +} + +/* + * Initialize iterator. + */ +SH_SCOPE void SH_START_ITERATE(SH_TYPE *tb, SH_ITERATOR *iter) +{ + int i; + uint64 startelem = PG_UINT64_MAX; + + /* + * Search for the first empty element. As deletions during iterations are + * supported, we want to start/end at an element that cannot be affected + * by elements being shifted. + */ + for (i = 0; i < tb->size; i++) { + SH_ELEMENT_TYPE *entry = &tb->data[i]; + + if (entry->status != SH_STATUS_IN_USE) { + startelem = i; + break; + } + } + + Assert(startelem < SH_MAX_SIZE); + + /* + * Iterate backwards, that allows the current element to be deleted, even + * if there are backward shifts + */ + iter->cur = startelem; + iter->end = iter->cur; + iter->done = false; +} + +/* + * Initialize iterator to a specific bucket. That's really only useful for + * cases where callers are partially iterating over the hashspace, and that + * iteration deletes and inserts elements based on visited entries. Doing that + * repeatedly could lead to an unbalanced keyspace when always starting at the + * same position. + */ +SH_SCOPE void SH_START_ITERATE_AT(SH_TYPE *tb, SH_ITERATOR *iter, uint32 at) +{ + /* + * Iterate backwards, that allows the current element to be deleted, even + * if there are backward shifts. + */ + iter->cur = at & tb->sizemask; /* ensure at is within a valid range */ + iter->end = iter->cur; + iter->done = false; +} + +/* + * Iterate over all entries in the hash-table. Return the next occupied entry, + * or NULL if done. + * + * During iteration the current entry in the hash table may be deleted, + * without leading to elements being skipped or returned twice. Additionally + * the rest of the table may be modified (i.e. there can be insertions or + * deletions), but if so, there's neither a guarantee that all nodes are + * visited at least once, nor a guarantee that a node is visited at most once. + */ +SH_SCOPE SH_ELEMENT_TYPE *SH_ITERATE(SH_TYPE *tb, SH_ITERATOR *iter) +{ + while (!iter->done) { + SH_ELEMENT_TYPE *elem; + + elem = &tb->data[iter->cur]; + + /* next element in backward direction */ + iter->cur = (iter->cur - 1) & tb->sizemask; + + if ((iter->cur & tb->sizemask) == (iter->end & tb->sizemask)) + iter->done = true; + if (elem->status == SH_STATUS_IN_USE) { + return elem; + } + } + + return NULL; +} + +/* + * Report some statistics about the state of the hashtable. For + * debugging/profiling purposes only. 
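Continuing the hypothetical int4hash example, the iteration protocol defined above is typically used as in the sketch below; deleting the entry currently returned by the iterator is explicitly allowed:

    int4hash_iterator iter;
    Int4HashEntry *entry;

    int4hash_start_iterate(tb, &iter);
    while ((entry = int4hash_iterate(tb, &iter)) != NULL)
    {
        if (entry->count == 0)
            int4hash_delete_item(tb, entry);    /* safe for the current entry only */
    }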
+ */ +SH_SCOPE void SH_STAT(SH_TYPE *tb) +{ + uint32 max_chain_length = 0; + uint32 total_chain_length = 0; + double avg_chain_length; + double fillfactor; + uint32 i; + + uint32 *collisions = (uint32 *)palloc0(tb->size * sizeof(uint32)); + uint32 total_collisions = 0; + uint32 max_collisions = 0; + double avg_collisions; + + for (i = 0; i < tb->size; i++) { + uint32 hash; + uint32 optimal; + uint32 dist; + SH_ELEMENT_TYPE *elem; + + elem = &tb->data[i]; + + if (elem->status != SH_STATUS_IN_USE) + continue; + + hash = SH_ENTRY_HASH(tb, elem); + optimal = SH_INITIAL_BUCKET(tb, hash); + dist = SH_DISTANCE_FROM_OPTIMAL(tb, optimal, i); + + if (dist > max_chain_length) + max_chain_length = dist; + total_chain_length += dist; + + collisions[optimal]++; + } + + for (i = 0; i < tb->size; i++) { + uint32 curcoll = collisions[i]; + + if (curcoll == 0) + continue; + + /* single contained element is not a collision */ + curcoll--; + total_collisions += curcoll; + if (curcoll > max_collisions) + max_collisions = curcoll; + } + + if (tb->members > 0) { + fillfactor = tb->members / ((double)tb->size); + avg_chain_length = ((double)total_chain_length) / tb->members; + avg_collisions = ((double)total_collisions) / tb->members; + } else { + fillfactor = 0; + avg_chain_length = 0; + avg_collisions = 0; + } + + sh_log("size: " UINT64_FORMAT ", members: %u, filled: %f, total chain: %u, max chain: %u, avg chain: %f, " + "total_collisions: %u, max_collisions: %u, avg_collisions: %f", + tb->size, tb->members, fillfactor, total_chain_length, max_chain_length, avg_chain_length, total_collisions, + max_collisions, avg_collisions); +} + +#endif /* SH_DEFINE */ + +/* undefine external parameters, so next hash table can be defined */ +#undef SH_PREFIX +#undef SH_KEY_TYPE +#undef SH_KEY +#undef SH_ELEMENT_TYPE +#undef SH_HASH_KEY +#undef SH_SCOPE +#undef SH_DECLARE +#undef SH_DEFINE +#undef SH_GET_HASH +#undef SH_STORE_HASH +#undef SH_USE_NONDEFAULT_ALLOCATOR +#undef SH_EQUAL + +/* undefine locally declared macros */ +#undef SH_MAKE_PREFIX +#undef SH_MAKE_NAME +#undef SH_MAKE_NAME_ +#undef SH_FILLFACTOR +#undef SH_MAX_FILLFACTOR +#undef SH_GROW_MAX_DIB +#undef SH_GROW_MAX_MOVE +#undef SH_GROW_MIN_FILLFACTOR +#undef SH_MAX_SIZE + +/* types */ +#undef SH_TYPE +#undef SH_STATUS +#undef SH_STATUS_EMPTY +#undef SH_STATUS_IN_USE +#undef SH_ITERATOR + +/* external function names */ +#undef SH_CREATE +#undef SH_DESTROY +#undef SH_RESET +#undef SH_INSERT +#undef SH_INSERT_HASH +#undef SH_DELETE_ITEM +#undef SH_DELETE +#undef SH_LOOKUP +#undef SH_LOOKUP_HASH +#undef SH_GROW +#undef SH_START_ITERATE +#undef SH_START_ITERATE_AT +#undef SH_ITERATE +#undef SH_ALLOCATE +#undef SH_FREE +#undef SH_STAT + +/* internal function names */ +#undef SH_COMPUTE_PARAMETERS +#undef SH_COMPARE_KEYS +#undef SH_INITIAL_BUCKET +#undef SH_NEXT +#undef SH_PREV +#undef SH_DISTANCE_FROM_OPTIMAL +#undef SH_ENTRY_HASH +#undef SH_INSERT_HASH_INTERNAL +#undef SH_LOOKUP_HASH_INTERNAL \ No newline at end of file diff --git a/src/include/nodes/tidbitmap.h b/src/include/nodes/tidbitmap.h index 31cc8ad8a..9fd19dbc1 100644 --- a/src/include/nodes/tidbitmap.h +++ b/src/include/nodes/tidbitmap.h @@ -28,8 +28,8 @@ * Actual bitmap representation is private to tidbitmap.c. Callers can * do IsA(x, TIDBitmap) on it, but nothing else. 
*/ -typedef struct TIDBitmap TIDBitmap; +typedef struct TIDBitmap TIDBitmap; /* Likewise, TBMIterator is private */ typedef struct TBMIterator TBMIterator; @@ -44,27 +44,47 @@ typedef struct { OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; } TBMIterateResult; +/* + * We want the caller to choose the hash implementation that suits it + * best: the dynamic hash table or the more cache-friendly simple hash + * table. Therefore a set of handlers is provided to avoid unnecessary + * branches inside this performance-critical area. + * + * All handlers defined here are templated on the hash table the caller + * uses, and the caller can invoke them with little to no overhead. + * + * Most external uses of tbm-related functions are exposed through this + * handler interface. Functions that are not templated, such as + * tbm_iterate, are not included in the handler set. + */ +typedef struct TBMHandler { + /* page generic handlers */ + void (*_add_tuples)(TIDBitmap*, const ItemPointer, int, bool, Oid, int2); + void (*_add_page)(TIDBitmap*, BlockNumber, Oid, int2); + + /* page operator handlers */ + void (*_union)(TIDBitmap*, const TIDBitmap*); + void (*_intersect)(TIDBitmap*, const TIDBitmap*); + + /* iterator handlers */ + TBMIterator* (*_begin_iterate)(TIDBitmap*); +} TBMHandler; + /* function prototypes in nodes/tidbitmap.c */ -extern TIDBitmap* TbmCreate(long maxbytes, bool is_ustore = false); +extern TIDBitmap* tbm_create(long maxbytes, bool is_global_part = true, bool is_crossbucket = true, bool is_ustore = false); extern void tbm_free(TIDBitmap* tbm); -extern long tbm_calculate_entries(double maxbytes); +extern long tbm_calculate_entries(double maxbytes, bool complex_key); -extern void tbm_add_tuples( - TIDBitmap* tbm, const ItemPointer tids, int ntids, bool recheck, Oid partitionOid = InvalidOid, - int2 bucketid = InvalidBktId); -extern void tbm_add_page(TIDBitmap* tbm, BlockNumber pageno, Oid partitionOid = InvalidOid, - int2 bucketid = InvalidBktId); - -extern void tbm_union(TIDBitmap* a, const TIDBitmap* b); -extern void tbm_intersect(TIDBitmap* a, const TIDBitmap* b); - -extern bool tbm_is_empty(const TIDBitmap* tbm); - -extern TBMIterator* tbm_begin_iterate(TIDBitmap* tbm); +/* iterator prototypes in nodes/tidbitmap.c */ extern TBMIterateResult* tbm_iterate(TBMIterator* iterator); extern void tbm_end_iterate(TBMIterator* iterator); + +/* function prototypes for TIDBitmap member checks */ +extern void tbm_set_global(TIDBitmap* tbm, bool val); extern bool tbm_is_global(const TIDBitmap* tbm); -extern void tbm_set_global(TIDBitmap* tbm, bool isGlobal); +extern bool tbm_is_empty(const TIDBitmap* tbm); extern bool tbm_is_crossbucket(const TIDBitmap* tbm); -extern void tbm_set_crossbucket(TIDBitmap* tbm, bool crossbucket); +extern TBMHandler tbm_get_handler(TIDBitmap* tbm); + #endif /* TIDBITMAP_H */ diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 940da8257..fe55c924b 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -129,6 +129,21 @@ static inline int pg_rightmost_one_pos64(uint64 word) #endif /* HAVE__BUILTIN_CTZ */ } +static inline uint64 pg_nextpower2_64(uint64 num) +{ + Assert(num > 0 && num <= PG_UINT64_MAX / 2 + 1); + + /* + * A power-of-2 number has only 1 bit set. Subtracting 1 from such a number + * will turn on all previous bits, resulting in no common bits being set + * between num and num-1.
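A hedged sketch of how callers are expected to use the handler interface declared above, mirroring the index AM changes earlier in this patch. The literal block/offset numbers, the byte budget, and the decision to leave the boolean tbm_create() flags at their defaults are assumptions for this example:

    long maxbytes = 4 * 1024 * 1024;            /* whatever budget the planner chose */
    TIDBitmap *tbm = tbm_create(maxbytes);      /* boolean flags left at their defaults */
    TBMHandler h = tbm_get_handler(tbm);        /* resolved once, outside the hot loop */
    ItemPointerData tid;

    ItemPointerSet(&tid, 1, 1);
    h._add_tuples(tbm, &tid, 1, false, InvalidOid, InvalidBktId);

    TBMIterator *iterator = h._begin_iterate(tbm);
    TBMIterateResult *res;
    while ((res = tbm_iterate(iterator)) != NULL)
    {
        /* visit the heap page described by *res */
    }
    tbm_end_iterate(iterator);
    tbm_free(tbm);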
+ */ + if ((num & (num - 1)) == 0) + return num; /* already power 2 */ + + return ((uint64) 1) << (pg_leftmost_one_pos64(num) + 1); +} + /* Count the number of one-bits in a uint32 or uint64 */ extern int (*pg_popcount32)(uint32 word); extern int (*pg_popcount64)(uint64 word); diff --git a/src/include/utils/hashutils.h b/src/include/utils/hashutils.h index a27eb6b9b..bfc08716d 100644 --- a/src/include/utils/hashutils.h +++ b/src/include/utils/hashutils.h @@ -42,4 +42,10 @@ static inline uint32 murmurhash32(uint32 data) return h; } +static inline uint32 hash_combine(uint32 a, uint32 b) +{ + a ^= b + 0x9e3779b9 + (a << 6) + (a >> 2); + return a; +} + #endif /* HASHUTILS_H */ diff --git a/src/include/utils/palloc.h b/src/include/utils/palloc.h index a09420797..fa9bd4d48 100644 --- a/src/include/utils/palloc.h +++ b/src/include/utils/palloc.h @@ -72,6 +72,8 @@ extern THR_LOCAL PGDLLIMPORT MemoryContext TopMemoryContext; #define MemoryContextAllocZero(context, size) MemoryContextAllocZeroDebug(context, size, __FILE__, __LINE__) #define MemoryContextAllocZeroAligned(context, size) \ MemoryContextAllocZeroAlignedDebug(context, size, __FILE__, __LINE__) +#define MemoryContextAllocExtended(context, size, flags) \ + MemoryContextAllocExtendedDebug(context, size, flags, __FILE__, __LINE__) #define MemoryContextStrdup(context, size) MemoryContextStrdupDebug(context, size, __FILE__, __LINE__) #define repalloc(pointer, size) repallocDebug(pointer, size, __FILE__, __LINE__) #define repalloc_noexcept(pointer, size) repalloc_noexcept_Debug(pointer, size, __FILE__, __LINE__) @@ -90,6 +92,7 @@ extern void* MemoryContextAllocHugeDebug(MemoryContext context, Size size, const extern void* repallocHugeDebug(void* pointer, Size size, const char* file, int line); extern void* MemoryContextAllocZeroDebug(MemoryContext context, Size size, const char* file, int line); extern void* MemoryContextAllocZeroAlignedDebug(MemoryContext context, Size size, const char* file, int line); +extern void* MemoryContextAllocExtendedDebug(MemoryContext context, Size size, int flags, const char* file, int line); extern char* MemoryContextStrdupDebug(MemoryContext context, const char* string, const char* file, int line); extern void* MemoryContextMemalignAllocDebug(MemoryContext context, Size align, Size size, const char* file, int line); extern void MemoryContextMemalignFree(MemoryContext context, void* pointer); diff --git a/src/test/regress/output/bitmapops.source b/src/test/regress/output/bitmapops.source index 28bdabe7c..5739a8782 100644 --- a/src/test/regress/output/bitmapops.source +++ b/src/test/regress/output/bitmapops.source @@ -25,6 +25,7 @@ EXPLAIN (analyze on, costs off, timing off) SELECT count(*) FROM bmscantest WHER Aggregate (actual rows=1 loops=1) -> Bitmap Heap Scan on bmscantest (actual rows=20 loops=1) Recheck Cond: ((b = 1) AND (a = 1)) + Rows Removed by Index Recheck: 1257 --? 
Heap Blocks: exact=.* -> BitmapAnd (actual rows=0 loops=1) -> Bitmap Index Scan on i_bmtest_b (actual rows=1051 loops=1) @@ -32,7 +33,7 @@ EXPLAIN (analyze on, costs off, timing off) SELECT count(*) FROM bmscantest WHER -> Bitmap Index Scan on i_bmtest_a (actual rows=1170 loops=1) Index Cond: (a = 1) --?.* -(10 rows) +(11 rows) SELECT count(*) FROM bmscantest WHERE a = 1 AND b = 1; count @@ -47,7 +48,7 @@ EXPLAIN (analyze on, costs off, timing off) SELECT count(*) FROM bmscantest WHER Aggregate (actual rows=1 loops=1) -> Bitmap Heap Scan on bmscantest (actual rows=2201 loops=1) Recheck Cond: ((a = 1) OR (b = 1)) - Rows Removed by Index Recheck: 14419 + Rows Removed by Index Recheck: 17287 --? Heap Blocks: exact=.* lossy=.* -> BitmapOr (actual rows=0 loops=1) -> Bitmap Index Scan on i_bmtest_a (actual rows=1170 loops=1) diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 64ba19842..468b62679 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1479,6 +1479,9 @@ SERIALIZABLEXIDTAG SERVICE_STATUS SERVICE_STATUS_HANDLE SERVICE_TABLE_ENTRY +SH_TYPE +SH_ITERATOR +SH_STATUS SHA1_CTX SHA224_CTX SHA256_CTX
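Finally, a hedged illustration of what the new murmurhash32()/hash_combine() helpers are typically used for together: folding a multi-field key, such as the block number, partition OID and bucket id of a PagetableEntryNode, into one 32-bit hash. This is one plausible scheme, not necessarily the exact one used by tidbitmap.cpp's simplehash instantiation:

    #include "utils/hashutils.h"

    static inline uint32
    pagetable_node_hash(BlockNumber blockNo, Oid partitionOid, int2 bucketid)
    {
        uint32 h = murmurhash32((uint32) blockNo);

        h = hash_combine(h, murmurhash32((uint32) partitionOid));
        h = hash_combine(h, murmurhash32((uint32) (uint16) bucketid));
        return h;
    }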