From 0a2c88b6297fda231a5ca0f922bf732a8d0445dd Mon Sep 17 00:00:00 2001
From: Vinoth Veeraraghavan
Date: Wed, 23 Mar 2022 19:09:26 +0700
Subject: [PATCH] Masstree OOM feature + bug fixes

More detail:
1. Feature: Add support for memory allocation failure
2. Bug fix: RCU/GC: node deleted before being disconnected
3. Bug fix: Leaves are marked as root by mistake
4. Bug fix: GC layer might access an already freed node
5. Bug fix: Memory leak (layers are not being removed)
6. Optimization: Disable some debug/unused code (collect new nodes)
7. Optimization: Disable phantom epoch support (not in use by MOT)
8. Optimization: Extend node version to 64 bits
9. Optimization: Optimize leaf size to support a larger internal ksuffix
---
 kvthread.hh             |  96 ++++++++++++++++++++++++++++++++-----
 masstree.hh             |   5 +-
 masstree_insert.hh      |  78 +++++++++++++++++++++++++++----
 masstree_remove.hh      |  17 +++++--
 masstree_scan.hh        |   4 +-
 masstree_split.hh       | 122 +++++++++++++++++++++++++++++++++++++++++++-----
 masstree_struct.hh      |  72 +++++++++++++++++++++-------
 masstree_tcursor.hh     |  31 ++++++------
 mot_masstree_config.hpp |  11 ++++-
 9 files changed, 365 insertions(+), 71 deletions(-)

diff --git a/kvthread.hh b/kvthread.hh
index 2c75e4e..364cd44 100644
--- a/kvthread.hh
+++ b/kvthread.hh
@@ -24,6 +24,48 @@
 #include
 #include
 #include
+#include <vector>
+
+enum {
+    MT_MERR_OK = 0,
+    MT_MERR_MAKE_SPLIT_PRE_ALLOC = 1,
+    MT_MERR_MAKE_SPLIT_LEAF_ALLOC = 2,
+    MT_MERR_MAKE_NEW_LAYER_LEAF_ALLOC_1 = 3,
+    MT_MERR_MAKE_NEW_LAYER_LEAF_ALLOC_2 = 4,
+    MT_MERR_MAKE_NEW_LAYER_KSUFFIX_ALLOC_1 = 5,
+    MT_MERR_MAKE_NEW_LAYER_KSUFFIX_ALLOC_2 = 6,
+    MT_MERR_FIND_INSERT_ASSIGN_SUFFIX = 7,
+    MT_MERR_SPLIT_INTO_ASSIGN_INITALIZE_1 = 8,
+    MT_MERR_SPLIT_INTO_ASSIGN_INITALIZE_2 = 9,
+    MT_MERR_GC_LAYER_REMOVAL_MAKE = 10,
+    MT_MERR_MAKE_SPLIT_ASSIGN_SUFFIX = 11,
+    MT_MERR_MAKE_SPLIT_PERM_EXCHANGE = 12,
+
+    // Errors that are handled internally (the operation should succeed even if last_error contains them)
+    MT_MERR_NON_DISRUPTIVE_ERRORS = 15,
+
+    // We should not return the following errors to the user, as they are covered by other errors in an upper layer
+    MT_MERR_NOT_RETURNED_TO_USER_ERRORS = 20,
+    MT_MERR_ASSIGN_KSUF = 21,
+    MT_MERR_MAKE_LEAF = 22,
+    MT_MERR_MAKE_ROOT_LEAF = 23,
+    MT_MERR_MAKE_INTERNODE = 24,
+    MT_MERR_LEAF_ASSIGN = 25,
+    MT_MERR_ASSIGN_INITALIZE_1 = 26,
+    MT_MERR_ASSIGN_INITALIZE_2 = 27,
+
+    // We should never reach the following errors
+    MT_MERR_UNREACHABLE_ERRORS = 30,
+    MT_MERR_MAKE_SPLIT_INTERNODE_ALLOC_NOT_EXPECTED,
+    MT_MERR_MAKE_SPLIT_INTERNODE_ALLOC_NOT_EXPECTED_2,
+    MT_MERR_MAKE_SPLIT_INTERNODE_ALLOC_EMPTY_PRE_ALLOC_NOT_EXPECTED,
+    MT_MERR_MAKE_SPLIT_INTERNODE_ALLOC_EMPTY_PRE_ALLOC_NOT_EXPECTED_2,
+
+    MT_MERR_NOT_IN_USE_LAST_ENTRY = 40
+};
+
+#define MAX_ALLOC_ERROR_TYPES MT_MERR_NOT_IN_USE_LAST_ENTRY
+
 class threadinfo;
 class loginfo;
@@ -42,7 +84,7 @@ extern volatile mrcu_epoch_type globalepoch;  // global epoch, updated regularly
 extern volatile mrcu_epoch_type active_epoch;
 
 // Memtags max allocation size
-#define MAX_MEMTAG_MASSTREE_LEAF_ALLOCATION_SIZE iceil(sizeof(leaf<P>) + 128, 64)
+#define MAX_MEMTAG_MASSTREE_LEAF_ALLOCATION_SIZE iceil(sizeof(leaf<P>) + 128, 64)
 #define MAX_MEMTAG_MASSTREE_INTERNODE_ALLOCATION_SIZE sizeof(internode<P>)
 #define MAX_MEMTAG_MASSTREE_LIMBO_GROUP_ALLOCATION_SIZE sizeof(mt_limbo_group)
@@ -117,6 +159,12 @@ class alignas(64) threadinfo {
         TI_MAIN, TI_PROCESS, TI_LOG, TI_CHECKPOINT
     };
 
+    typedef struct rcu_entry {
+        void* p;
+        size_t sz;
+        memtag tag;
+    } rcu_entry_t;
+
     static threadinfo* allthreads;
 
     threadinfo* next() const {
@@ -229,15 +277,14 @@ class alignas(64) threadinfo {
     void deallocate_rcu(void* p, size_t sz, memtag tag) {
         assert(p);
-        memdebug::check_rcu(p, sz, tag);
-        record_rcu(p, sz, tag);
-        mark(threadcounter(tc_alloc + (tag > memtag_value)), -sz);
+        dealloc_rcu.push_back({p, sz, tag});
     }
 
     void* pool_allocate(size_t sz, memtag tag) {
         void* p = NULL;
         int nl = (sz + memdebug_size + CACHE_LINE_SIZE - 1) / CACHE_LINE_SIZE;
         if (use_pool()) {
+            masstree_invariant(false); // MOT code should not reach here
             assert(nl <= pool_max_nlines);
             if (unlikely(!pool_[nl - 1]))
                 refill_pool(nl);
@@ -264,17 +311,30 @@ class alignas(64) threadinfo {
             *reinterpret_cast<void**>(p) = pool_[nl - 1];
             pool_[nl - 1] = p;
         } else
-            free(p);
+            deallocate(p, sz, tag); // MOT memory deallocation
         mark(threadcounter(tc_alloc + (tag > memtag_value)),
              -nl * CACHE_LINE_SIZE);
     }
 
     void pool_deallocate_rcu(void* p, size_t sz, memtag tag) {
-        int nl = (sz + memdebug_size + CACHE_LINE_SIZE - 1) / CACHE_LINE_SIZE;
-        assert(p && nl <= pool_max_nlines);
-        memdebug::check_rcu(p, sz, memtag(tag + nl));
-        record_rcu(p, sz, use_pool() ? memtag(tag + nl) : tag);
-        mark(threadcounter(tc_alloc + (tag > memtag_value)),
-             -nl * CACHE_LINE_SIZE);
+        if (unlikely(use_pool())) {
+            int nl = (sz + memdebug_size + CACHE_LINE_SIZE - 1) / CACHE_LINE_SIZE;
+            assert(p && nl <= pool_max_nlines);
+            memdebug::check_rcu(p, sz, memtag(tag + nl));
+            mark(threadcounter(tc_alloc + (tag > memtag_value)),
+                 -nl * CACHE_LINE_SIZE);
+            dealloc_rcu.push_back({p, sz, memtag(tag + nl)});
+        } else {
+            dealloc_rcu.push_back({p, sz, tag});
+        }
+    }
+
+    void add_nodes_to_gc() {
+        for (uint32_t i = 0 ; i < dealloc_rcu.size() ; i++) {
+            masstree_invariant(dealloc_rcu[i].p);
+            record_rcu(dealloc_rcu[i].p, dealloc_rcu[i].sz, dealloc_rcu[i].tag);
+            dealloc_rcu[i].p = nullptr;
+        }
+        dealloc_rcu.clear();
     }
 
     // RCU
@@ -308,6 +368,11 @@ class alignas(64) threadinfo {
         return pthreadid_;
     }
 
+    inline void set_last_error(int error) { masstree_invariant(error < MT_MERR_UNREACHABLE_ERRORS); last_error = error; }
+    inline int get_last_error() { return last_error; }
+    inline bool non_disruptive_error() { return last_error == 0 ||
+        (last_error > MT_MERR_NON_DISRUPTIVE_ERRORS && last_error < MT_MERR_NOT_RETURNED_TO_USER_ERRORS); }
+
     void report_rcu(void* ptr) const;
     static void report_rcu_all(void* ptr);
     static inline mrcu_epoch_type min_active_epoch();
@@ -333,8 +398,14 @@ class alignas(64) threadinfo {
 #endif
     }
 
+    bool is_empty_rcu_array() {
+        return dealloc_rcu.size() == 0;
+    }
+
   private:
     MOT::MasstreePrimaryIndex * cur_working_index;
+    int last_error = MT_MERR_OK;
+    std::vector<rcu_entry_t> dealloc_rcu;
     union {
         struct {
             mrcu_epoch_type gc_epoch_;
@@ -386,7 +457,8 @@ class alignas(64) threadinfo {
     void ng_record_rcu(void* ptr, int size, memtag tag);
 
     void record_rcu(void* ptr, int size, memtag tag) {
-        if (use_pool()) {
+        if (unlikely(use_pool())) {
+            masstree_invariant(false); // MOT code should not reach here
             if (limbo_tail_->tail_ + 2 > limbo_tail_->capacity)
                 refill_rcu();
             uint64_t epoch = ng_getGlobalEpoch();
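The kvthread.hh changes above split RCU reclamation into two phases: deallocate_rcu() and pool_deallocate_rcu() only queue the node in a per-thread vector, and record_rcu() runs later, when add_nodes_to_gc() is called once the structural change has been published. The sketch below shows just that buffering pattern in isolation; the types and the record_rcu callback are simplified stand-ins, not the patch's real threadinfo API.

    #include <cstddef>
    #include <vector>

    using memtag = int;                                   // stand-in for the real memtag enum
    struct rcu_entry { void* p; std::size_t sz; memtag tag; };

    class deferred_rcu {
      public:
        // Phase 1: node is still reachable by readers - only remember it.
        void deallocate_rcu(void* p, std::size_t sz, memtag tag) {
            dealloc_rcu_.push_back({p, sz, tag});
        }
        // Phase 2: node is unlinked and the operation committed - hand the
        // whole batch to the epoch-based reclaimer (record_rcu in the patch).
        template <typename RecordFn>
        void add_nodes_to_gc(RecordFn record_rcu) {
            for (rcu_entry& e : dealloc_rcu_) {
                record_rcu(e.p, e.sz, e.tag);
                e.p = nullptr;
            }
            dealloc_rcu_.clear();
        }
        bool is_empty_rcu_array() const { return dealloc_rcu_.empty(); }
      private:
        std::vector<rcu_entry> dealloc_rcu_;
    };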
diff --git a/masstree.hh b/masstree.hh
index eaf6503..89af0ee 100644
--- a/masstree.hh
+++ b/masstree.hh
@@ -42,8 +42,8 @@ template <int LW, int IW> struct nodeparams {
     static constexpr int bound_method = bound_method_fast;
     static constexpr int debug_level = 0;
     typedef uint64_t ikey_type;
-    typedef uint32_t nodeversion_value_type;
-    static constexpr bool need_phantom_epoch = true;
+    typedef uint64_t nodeversion_value_type;
+    static constexpr bool need_phantom_epoch = false;
     typedef uint64_t phantom_epoch_type;
     static constexpr ssize_t print_max_indent_depth = 12;
     typedef key_unparse_printable_string key_unparse_type;
@@ -95,6 +95,7 @@ class basic_table {
     inline node_type* root() const;
     inline node_type* fix_root();
+    inline node_type** root_ref() { return &root_; }
 
     bool get(Str key, value_type& value, threadinfo& ti) const;
 
diff --git a/masstree_insert.hh b/masstree_insert.hh
index 4a71942..e641f03 100644
--- a/masstree_insert.hh
+++ b/masstree_insert.hh
@@ -21,15 +21,18 @@ namespace Masstree {
 
 template <typename P>
-bool tcursor<P>::find_insert(threadinfo& ti)
+bool tcursor<P>::find_insert(threadinfo& ti, bool & found)
 {
+    found = false;
     find_locked(ti);
     original_n_ = n_;
     original_v_ = n_->full_unlocked_version_value();
 
     // maybe we found it
-    if (state_)
+    if (state_) {
+        found = true;
         return true;
+    }
 
     // otherwise mark as inserted but not present
     state_ = 2;
@@ -59,8 +62,11 @@ bool tcursor<P>::find_insert(threadinfo& ti)
        1. If leaf is the most left leaf in the btree which means ikey0_[0] is not used as a boundary. (!n_->prev_)
        2. If a new key, with ikey == ikey0_[0], is added. In this case, we can re-use slot 0 as we won't change the tree's structure. (n_->ikey_bound() == ka_.ikey()) */
     if (likely(kx_.p != 0) || !n_->prev_ || n_->ikey_bound() == ka_.ikey()) {
-        n_->assign(kx_.p, ka_, ti);
-        return false;
+        // If n_->assign fails, we don't have enough space to place the suffix and we failed to allocate a larger ksuffix.
+        bool res = n_->assign(kx_.p, ka_, ti);
+        if (!res)
+            ti.set_last_error(MT_MERR_FIND_INSERT_ASSIGN_SUFFIX);
+        return res;
     }
 }
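With the new signature, find_insert's return value reports whether the cursor is usable (no disruptive allocation failure), while the found out-parameter reports whether the key already existed. A hypothetical caller could therefore look roughly like the template below; insert_or_get and its value handling are invented for illustration and are not part of the patch.

    // Hypothetical wrapper around the new API (illustrative names only).
    template <typename Cursor, typename ThreadInfo, typename Value>
    bool insert_or_get(Cursor& lp, ThreadInfo& ti, const Value& v, bool& existed) {
        if (!lp.find_insert(ti, existed)) {   // allocation failed somewhere below
            lp.finish(0, ti);                 // abort: unlock and flush deferred GC
            return false;                     // caller may inspect ti.get_last_error()
        }
        if (!existed)
            lp.value() = v;                   // publish the value in the reserved slot
        lp.finish(1, ti);                     // commit the insert
        return true;
    }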

@@ -78,8 +84,13 @@ bool tcursor<P>::make_new_layer(threadinfo& ti) {
     // For each ikey_size bytes (currently 8) that matches in both key's suffixes, we will need to create a new layer
     leaf_type* twig_head = n_;
     leaf_type* twig_tail = n_;
+    leaf_type* nl = nullptr;
     while (kcmp == 0) {
-        leaf_type* nl = leaf_type::make_root(0, twig_tail, ti);
+        nl = leaf_type::make_root(0, twig_tail, ti);
+        if (!nl) {
+            ti.set_last_error(MT_MERR_MAKE_NEW_LAYER_LEAF_ALLOC_1);
+            goto make_new_layer_cleanup;
+        }
         nl->assign_initialize_for_layer(0, oka);
         if (twig_head != n_)
             twig_tail->lv_[0] = nl;
@@ -87,7 +98,9 @@ bool tcursor<P>::make_new_layer(threadinfo& ti) {
             twig_head = nl;
         nl->permutation_ = permuter_type::make_sorted(1);
         twig_tail = nl;
+#ifndef MOT_OBSOLETE_CODE
         new_nodes_.emplace_back(nl, nl->full_unlocked_version_value());
+#endif
         oka.shift();
         ka_.shift();
         // Compare the ikey only. if ikey matches and one or more of the suffixes != 0, compare using suffix size
@@ -102,9 +115,24 @@ bool tcursor<P>::make_new_layer(threadinfo& ti) {
             + n_->iksuf_[0].overhead(n_->width);
     else
         ksufsize = 0;
-    leaf_type *nl = leaf_type::make_root(ksufsize, twig_tail, ti);
-    nl->assign_initialize(0, kcmp < 0 ? oka : ka_, ti);
-    nl->assign_initialize(1, kcmp < 0 ? ka_ : oka, ti);
+    nl = leaf_type::make_root(ksufsize, twig_tail, ti);
+    if (!nl) {
+        ti.set_last_error(MT_MERR_MAKE_NEW_LAYER_LEAF_ALLOC_2);
+        goto make_new_layer_cleanup;
+    }
+    // Even though the total ksuffix size was already provided to make_root, more memory might be allocated in the assign_initialize calls,
+    // as the leaf's internal suffix is bounded by 128 (+ 64 alignment).
+    // We will hit this issue (for sure) if ka_.suffix_length() + oka.suffix_length() > 192, but might hit it also when ka_.suffix_length() + oka.suffix_length() > 128.
+    if (!nl->assign_initialize(0, kcmp < 0 ? oka : ka_, ti)) {
+        ti.set_last_error(MT_MERR_MAKE_NEW_LAYER_KSUFFIX_ALLOC_1);
+        goto make_new_layer_cleanup;
+    }
+
+    if (!nl->assign_initialize(1, kcmp < 0 ? ka_ : oka, ti)) {
+        ti.set_last_error(MT_MERR_MAKE_NEW_LAYER_KSUFFIX_ALLOC_2);
+        goto make_new_layer_cleanup;
+    }
+
     nl->lv_[kcmp > 0] = n_->lv_[kx_.p];
     nl->lock(*nl, ti.lock_fence(tc_leaf_lock));
     if (kcmp < 0)
@@ -134,6 +162,33 @@ bool tcursor<P>::make_new_layer(threadinfo& ti) {
     n_->unlock();
     n_ = nl;
     kx_.i = kx_.p = kcmp < 0;
+    return true;
+
+make_new_layer_cleanup:
+    // n_ was not updated yet. It still contains the original key (without any change). It will be unlocked later on (in lp.finish).
+    if (nl) {
+        // nl is not connected to twig_tail yet. Handle it separately.
+        nl->deallocate(ti);
+        nl = nullptr;
+    }
+
+    // Leaves in the leaf list (starting from twig_head) have no suffix. In addition, they are not connected to the masstree yet, so we don't need to hold any locks.
+    if (twig_head != n_) {
+        while (twig_head) {
+            masstree_invariant(!twig_head->ksuf_);
+            masstree_invariant(twig_head->size() == 1);
+            masstree_invariant(twig_head->is_layer(0));
+            masstree_invariant(twig_head->stable_annotated(ti.stable_fence()).is_root());
+            leaf_type *next_layer_leaf = (leaf_type *)twig_head->lv_[0].layer();
+            twig_head->lv_[0] = nullptr;
+            // Remove it directly. No need to use rcu.
+            ti.deallocate(twig_head, sizeof(*twig_head) /* Being ignored */, memtag_masstree_leaf);
+            // Stop if we just finished handling the last leaf in the list (twig_tail).
+            // Validating that next_layer_leaf != null won't work, as twig_tail->lv_[0] == twig_tail.
+            twig_head = (twig_head == twig_tail) ? nullptr : next_layer_leaf;
+        }
+    }
+
     return false;
 }
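The loop above builds one single-entry layer leaf per 8-byte ikey slice on which the two keys still agree, and only the final leaf holds both keys; the cleanup label then tears that chain down again if any allocation fails. The stand-alone helper below illustrates how many intermediate layers two suffixes force; it is an illustration of the loop's termination condition, not code from the patch.

    #include <algorithm>
    #include <cstdint>
    #include <cstring>
    #include <string>

    // Number of fully matching 8-byte ikey slices before two suffixes diverge,
    // i.e. how many single-entry layer leaves make_new_layer would chain up.
    static int matching_ikey_slices(const std::string& a, const std::string& b) {
        int slices = 0;
        std::size_t off = 0;
        while (true) {
            std::uint64_t ia = 0, ib = 0;
            std::memcpy(&ia, a.data() + off, std::min<std::size_t>(8, a.size() - off));
            std::memcpy(&ib, b.data() + off, std::min<std::size_t>(8, b.size() - off));
            if (ia != ib || a.size() - off <= 8 || b.size() - off <= 8)
                return slices;
            ++slices;
            off += 8;
        }
    }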

@@ -152,15 +207,20 @@ inline void tcursor<P>::finish(int state, threadinfo& ti)
 {
     if (state < 0 && state_ == 1) {
         if (finish_remove(ti))
-            return;
+            goto clean_ti;
     } else if (state > 0 && state_ == 2)
         finish_insert();
 
     // we finally know this!
     if (n_ == original_n_)
         updated_v_ = n_->full_unlocked_version_value();
+#ifndef MOT_OBSOLETE_CODE
     else
         new_nodes_.emplace_back(n_, n_->full_unlocked_version_value());
+#endif
 
     n_->unlock();
+
+clean_ti:
+    ti.add_nodes_to_gc();
 }
 
 } // namespace Masstree
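finish() is also where the deferred frees collected during the operation are finally handed to the garbage collector, which is what fixes the delete-before-disconnect ordering. On the error side, MOT can classify whatever is left in last_error using the ranges defined in kvthread.hh; the helper below mirrors threadinfo::non_disruptive_error() with the enum values restated locally, purely as an illustration.

    // Restated locally for illustration; the real values live in kvthread.hh.
    enum {
        EX_MT_MERR_OK = 0,
        EX_MT_MERR_NON_DISRUPTIVE_ERRORS = 15,
        EX_MT_MERR_NOT_RETURNED_TO_USER_ERRORS = 20
    };

    // True when the last error either never happened or was absorbed internally,
    // i.e. the insert/remove itself still succeeded.
    static inline bool non_disruptive(int last_error) {
        return last_error == EX_MT_MERR_OK ||
               (last_error > EX_MT_MERR_NON_DISRUPTIVE_ERRORS &&
                last_error < EX_MT_MERR_NOT_RETURNED_TO_USER_ERRORS);
    }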

diff --git a/masstree_remove.hh b/masstree_remove.hh
index 5795261..a647ea5 100644
--- a/masstree_remove.hh
+++ b/masstree_remove.hh
@@ -144,8 +144,9 @@ void gc_layer_rcu_callback<P>::operator()(threadinfo& ti)
         if (!do_remove || !lp.finish_remove(ti)) {
             lp.n_->unlock();
         }
-        ti.deallocate(this, size(), memtag_masstree_gc);
     }
+    ti.deallocate(this, size(), memtag_masstree_gc);
+    ti.add_nodes_to_gc();
 }
 
 template <typename P>
@@ -172,18 +173,18 @@ bool tcursor<P>::finish_remove(threadinfo& ti)
     if (perm.size()) {
         return false;
     } else {
-        return remove_leaf(n_, root_, ka_.prefix_string(), ti);
+        return remove_leaf(n_, root_ref_, ka_.prefix_string(), ti);
     }
 }
 
 template <typename P>
-bool tcursor<P>::remove_leaf(leaf_type* leaf, node_type* root,
+bool tcursor<P>::remove_leaf(leaf_type* leaf, node_type** root_ref,
                              Str prefix, threadinfo& ti)
 {
     if (!leaf->prev_) {
         if (!leaf->next_.ptr && !prefix.empty()) {
             // Leaf doesn't hold any keys, not in the highest layer and has no neighbors --> entire layer can be destroyed
-            gc_layer_rcu_callback_ng<P>::make(root, prefix, ti);
+            gc_layer_rcu_callback_ng<P>::make(root_ref, prefix, ti);
         }
         // Leaf has neighbor to the right (next) or leaf in the highest layer. do nothing
         return false;
@@ -211,6 +212,14 @@ bool tcursor<P>::remove_leaf(leaf_type* leaf, node_type* root,
     // Unlink leaf from doubly-linked leaf list
     btree_leaflink<leaf_type>::unlink(leaf);
 
+    // leaf->prev_ != NULL
+    leaf_type *prev = leaf->prev_;
+    if (!prev->prev_ && !prev->next_.ptr && prev->size() == 0 && !prefix.empty()) {
+        // After removing the leaf, only the leftmost leaf remains (a single leaf). We can remove the layer, as the leftmost leaf
+        // doesn't hold any keys and the layer is not the highest one.
+        gc_layer_rcu_callback_ng<P>::make(root_ref, prefix, ti);
+    }
+
     // Remove leaf from tree, collapse trivial chains, and rewrite
     // ikey bounds.
     ikey_type ikey = leaf->ikey_bound();
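remove_leaf now schedules a whole-layer removal in a second situation: when, after unlinking the removed leaf, only the leftmost leaf is left, it holds no keys, and the layer is not the top layer (prefix non-empty). The predicate below restates both conditions over a reduced leaf model; the field names are stand-ins for the real leaf members, not the patch's types.

    // Reduced model of the fields the layer-removal decision looks at.
    struct leaf_view {
        bool has_prev;   // leaf->prev_ != nullptr
        bool has_next;   // leaf->next_.ptr != nullptr
        int  nkeys;      // live keys in the leaf
    };

    // Original case: the removed leaf is a lone, leftmost leaf in a lower layer.
    static bool empty_layer_case(const leaf_view& removed, bool prefix_empty) {
        return !prefix_empty && !removed.has_prev && !removed.has_next;
    }

    // New case added by the patch: after unlinking, only the leftmost leaf
    // remains and it no longer holds any keys.
    static bool collapsed_layer_case(const leaf_view& remaining_leftmost, bool prefix_empty) {
        return !prefix_empty && !remaining_leftmost.has_prev &&
               !remaining_leftmost.has_next && remaining_leftmost.nkeys == 0;
    }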

diff --git a/masstree_scan.hh b/masstree_scan.hh
index 31ffcbc..f7b0937 100644
--- a/masstree_scan.hh
+++ b/masstree_scan.hh
@@ -306,8 +306,10 @@ int scanstackelt<P>::find_next(H &helper, key_type &ka, leafvalue_type &entry)
     fence();
     entry = n_->lv_[kp];
     entry.prefetch(keylenx);
-    if (n_->keylenx_has_ksuf(keylenx))
+    if (n_->keylenx_has_ksuf(keylenx)) {
         keylen = ka.assign_store_suffix(n_->ksuf(kp));
+        masstree_invariant(keylen < (int)MASSTREE_MAXKEYLEN);
+    }
 
     if (n_->has_changed(v_))
         goto changed;
 
diff --git a/masstree_split.hh b/masstree_split.hh
index fcf35ee..42b012b 100644
--- a/masstree_split.hh
+++ b/masstree_split.hh
@@ -46,7 +46,7 @@ leaf<P>::ikey_after_insert(const permuter_type& perm, int i,
 
    The split type is 0 if @a ka went into *this, 1 if the @a ka went into
    *@a nr, and 2 for the sequential-order optimization (@a ka went into *@a
-   nr and no other keys were moved). */
+   nr and no other keys were moved). If -1, the split failed due to a memory issue. */
 template <typename P>
 int leaf<P>::split_into(leaf<P>* nr, tcursor<P>* cursor,
                         ikey_type& split_ikey, threadinfo& ti)
@@ -71,7 +71,8 @@ int leaf<P>::split_into(leaf<P>* nr, tcursor<P>* cursor,
     int p = cursor->kx_.i;
     if (p == 0 && !this->prev_) {
         // reverse-sequential optimization
-        mid = 1;
+        // We remove this optimization as it can lead us to an empty leaf (in case the insertion fails)
+        // mid = 1;
     } else if (p == width && !this->next_.ptr) {
         // sequential optimization
         mid = width;
@@ -100,9 +101,16 @@ int leaf<P>::split_into(leaf<P>* nr, tcursor<P>* cursor,
     typename permuter_type::value_type pv = perml.value_from(mid - (p < mid));
     for (int x = mid; x <= width; ++x) {
         if (x == p) {
-            nr->assign_initialize(x - mid, cursor->ka_, ti);
+            if (!nr->assign_initialize(x - mid, cursor->ka_, ti)) {
+                ti.set_last_error(MT_MERR_SPLIT_INTO_ASSIGN_INITALIZE_1);
+                return -1;
+            }
+
         } else {
-            nr->assign_initialize(x - mid, this, pv & 15, ti);
+            if (!nr->assign_initialize(x - mid, this, pv & 15, ti)) {
+                ti.set_last_error(MT_MERR_SPLIT_INTO_ASSIGN_INITALIZE_2);
+                return -1;
+            }
             pv >>= 4;
         }
     }
@@ -174,6 +182,14 @@ int internode<P>::split_into(internode<P>* nr, int p, ikey_type ka,
     }
 }
 
+template <typename P>
+void tcursor<P>::release_internodes(internode_type * internodes_array[], int start, int end, threadinfo& ti) {
+    for (int i = start; i < end; i++) {
+        masstree_invariant(internodes_array[i]);
+        ti.deallocate(internodes_array[i], sizeof(*internodes_array[i]) /* Being ignored */, memtag_masstree_internode);
+        internodes_array[i] = nullptr;
+    }
+}
 
 template <typename P>
 bool tcursor<P>::make_split(threadinfo& ti)
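make_split (next hunk) avoids failing half-way through a multi-level split by reserving all internodes it could possibly need before touching the tree, and release_internodes returns whatever was not consumed. The sketch below shows that reserve/consume/release pattern generically; malloc/free stand in for ti.pool_allocate()/ti.deallocate(), so it is an illustration of the strategy rather than the patch's code.

    #include <cstdlib>
    #include <vector>

    struct internode_reservation {
        std::vector<void*> nodes;
        std::size_t next = 0;

        // Reserve 'count' nodes up front; on failure release everything and report OOM.
        bool reserve(std::size_t count, std::size_t node_size) {
            for (std::size_t i = 0; i < count; ++i) {
                void* p = std::malloc(node_size);
                if (!p) {
                    release_all();
                    return false;
                }
                nodes.push_back(p);
            }
            return true;
        }
        // Consume the next pre-allocated node; the split itself can no longer fail here.
        void* take() { return next < nodes.size() ? nodes[next++] : nullptr; }
        // Free the nodes the split did not end up needing (release_internodes in the patch).
        void release_unused() {
            while (nodes.size() > next) {
                std::free(nodes.back());
                nodes.pop_back();
            }
        }
        void release_all() { next = 0; release_unused(); }
    };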

@@ -191,17 +207,66 @@ bool tcursor<P>::make_split(threadinfo& ti)
         if (kx_.p != 0) {
             n_->permutation_ = perm.value();
             fence();
-            n_->assign(kx_.p, ka_, ti);
+            if (n_->assign(kx_.p, ka_, ti)) {
+                return true;
+            }
+            ti.set_last_error(MT_MERR_MAKE_SPLIT_PERM_EXCHANGE);
+            return false;
+        }
+    }
+
+    bool rc = true;
+
+    // 2 optimizations that can reduce the number of internode allocations:
+    // 1. If n_ does not have a parent, only 1 internode is required (rare case - only on the first split)
+    // 2. In case n_'s parent has a free slot and its height is 1, we don't need internodes at all (common case, but requires an early lock of n_'s parent)
+    node_type* l_root = n_;
+
+    while (!l_root->is_root()) {
+        if (n_ != l_root) {
+            l_root->stable_annotated(ti.stable_fence());
+        }
+        l_root = l_root->maybe_parent();
+    }
+
+    // l_root->height_ is the layer's real height or higher.
+    uint32_t layer_height = l_root->isleaf() ? 1 : ((internode_type *)l_root)->height_;
+    int reserved_nodes = layer_height + 5; // add 5 extra nodes (extra 5 layers in a single b-tree)
+    internode_type * preallocated_internodes[reserved_nodes + 1] = { 0 };
+    int cur_cache_index = 0;
+
+    for (int i = 0; i < reserved_nodes; i++) {
+        preallocated_internodes[i] = (internode_type *)ti.pool_allocate(MAX_MEMTAG_MASSTREE_INTERNODE_ALLOCATION_SIZE,
+                                                                        memtag_masstree_internode);
+        if (!preallocated_internodes[i]) {
+            release_internodes(preallocated_internodes, 0, i, ti);
+            ti.set_last_error(MT_MERR_MAKE_SPLIT_PRE_ALLOC);
             return false;
         }
     }
 
     node_type* child = leaf_type::make(n_->ksuf_used_capacity(), n_->phantom_epoch(), ti);
+    if (!child) {
+        release_internodes(preallocated_internodes, 0, reserved_nodes, ti);
+        ti.set_last_error(MT_MERR_MAKE_SPLIT_LEAF_ALLOC);
+        return false;
+    }
     child->assign_version(*n_);
+    child->mark_nonroot();
+    // As n_ is locked, child is locked as well.
     ikey_type xikey[2];
     // Add the new key and spread the keys between the 2 leafs. The new key might be inserted to either one of the leafs. Link to parent will be done later.
     int split_type = n_->split_into(static_cast<leaf_type *>(child), this, xikey[0], ti);
+
+    if (split_type < 0) {
+        // Split failed due to a ksuffix memory allocation error (child is not connected to n_ at this stage)
+        release_internodes(preallocated_internodes, 0, reserved_nodes, ti);
+        // child is not visible yet, so we can deallocate without rcu
+        ((leaf_type *)child)->deallocate(ti);
+        child = nullptr;
+        return false;
+    }
     unsigned sense = 0;
     node_type* n = n_;
     uint32_t height = 0;
@@ -219,7 +284,17 @@ bool tcursor<P>::make_split(threadinfo& ti)
         }
 
         if (kp < 0 || p->height_ > height + 1) {
-            internode_type *nn = internode_type::make(height + 1, ti);
+            masstree_invariant(preallocated_internodes[cur_cache_index]);
+            internode_type *nn = internode_type::make(height + 1, ti, preallocated_internodes[cur_cache_index++]);
+            if (!nn) {
+                // Should never happen with pre-allocated internodes. This bad flow is not handled.
+                ti.set_last_error(MT_MERR_MAKE_SPLIT_INTERNODE_ALLOC_NOT_EXPECTED);
+            }
+
+            if (cur_cache_index == reserved_nodes) {
+                // Should never happen with pre-allocated internodes (we should have enough reserved nodes). This bad flow is not handled.
+                ti.set_last_error(MT_MERR_MAKE_SPLIT_INTERNODE_ALLOC_EMPTY_PRE_ALLOC_NOT_EXPECTED);
+            }
             nn->child_[0] = n;
             nn->assign(0, xikey[sense], child);
             nn->nkeys_ = 1;
@@ -233,11 +308,22 @@ bool tcursor<P>::make_split(threadinfo& ti)
             n->set_parent(nn);
         } else {
             if (p->size() >= p->width) {
-                next_child = internode_type::make(height + 1, ti);
+                masstree_invariant(preallocated_internodes[cur_cache_index]);
+                next_child = internode_type::make(height + 1, ti, preallocated_internodes[cur_cache_index++]);
+                if (!next_child) {
+                    // Should never happen with pre-allocated internodes. This bad flow is not handled.
+                    ti.set_last_error(MT_MERR_MAKE_SPLIT_INTERNODE_ALLOC_NOT_EXPECTED_2);
+                }
+
+                if (cur_cache_index == reserved_nodes) {
+                    // Should never happen with pre-allocated internodes (we should have enough reserved nodes). This bad flow is not handled.
+                    ti.set_last_error(MT_MERR_MAKE_SPLIT_INTERNODE_ALLOC_EMPTY_PRE_ALLOC_NOT_EXPECTED_2);
+                }
+
                 next_child->assign_version(*p);
                 next_child->mark_nonroot();
                 kp = p->split_into(next_child, kp, xikey[sense],
-                                   child, xikey[sense ^ 1], split_type);
+                                   child, xikey[sense ^ 1], split_type); // No memory allocation
             }
             if (kp >= 0) {
                 p->shift_up(kp + 1, kp, p->size() - kp);
@@ -259,16 +345,27 @@ bool tcursor<P>::make_split(threadinfo& ti)
     int width = perml.size();
     perml.set_size(width - nr->size());
     // removed item, if any, must be @ perml.size()
+    int perm_size = perml.size();
+    masstree_invariant(perm_size > 0); // Verify that the leaf is not empty
     if (width != nl->width) {
-        perml.exchange(perml.size(), nl->width - 1);
+        perml.exchange(perm_size, nl->width - 1);
     }
     nl->mark_split();
     nl->permutation_ = perml.value();
     // account for split
     if (split_type == 0) {
         kx_.p = perml.back();
-        nl->assign(kx_.p, ka_, ti);
+
+        // In case the newly inserted key should be placed in the original leaf (left leaf), memory allocation might be needed for its ksuffix.
+        // If assign fails (--> memory allocation failure), the flow will continue, but we mark rc as false to indicate that the insertion failed.
+        // In this case, the key won't be exposed in finish_insert(), but the leaf split will be completed successfully.
+        if (!nl->assign(kx_.p, ka_, ti)) {
+            ti.set_last_error(MT_MERR_MAKE_SPLIT_ASSIGN_SUFFIX);
+            rc = false;
+        }
+#ifndef MOT_OBSOLETE_CODE
         new_nodes_.emplace_back(nr, nr->full_unlocked_version_value());
+#endif
     } else {
         kx_.i = kx_.p = kx_.i - perml.size();
         n_ = nr;
@@ -296,7 +393,10 @@ bool tcursor<P>::make_split(threadinfo& ti)
         }
     }
 
-    return false;
+    // Free unused pre-allocated internodes
+    release_internodes(preallocated_internodes, cur_cache_index, reserved_nodes, ti);
+
+    return rc;
 }
 
 } // namespace Masstree
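The reservation size, layer_height + 5, is consistent with how far a leaf split can propagate: at most one new internode per existing internode level plus possibly one new root, so a reservation of roughly the layer height already suffices, and the slack of 5 only covers the height estimate being read from an unlocked root. The two lines below merely restate that bound; the function name is invented for illustration.

    // Worst case for a single leaf split: one new internode per internode level,
    // plus possibly one brand-new root on top of the layer.
    static inline int internodes_needed_for_split(int internode_levels) {
        return internode_levels + 1;
    }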

diff --git a/masstree_struct.hh b/masstree_struct.hh
index 8f121a9..1b5d853 100644
--- a/masstree_struct.hh
+++ b/masstree_struct.hh
@@ -120,9 +120,15 @@ class internode : public node_base<P> {
         : node_base<P>(false), nkeys_(0), height_(height), parent_() {
     }
 
-    static internode<P>* make(uint32_t height, threadinfo& ti) {
-        void* ptr = ti.pool_allocate(MAX_MEMTAG_MASSTREE_INTERNODE_ALLOCATION_SIZE,
+    static internode<P>* make(uint32_t height, threadinfo& ti, void * allocated_internode = nullptr) {
+        void* ptr = allocated_internode ?
+            allocated_internode :
+            ti.pool_allocate(MAX_MEMTAG_MASSTREE_INTERNODE_ALLOCATION_SIZE,
                              memtag_masstree_internode);
+        if (!ptr) {
+            ti.set_last_error(MT_MERR_MAKE_INTERNODE);
+            return nullptr;
+        }
         internode<P>* n = new(ptr) internode<P>(height);
         assert(n);
         if (P::debug_level > 0)
@@ -319,8 +325,12 @@ class leaf : public node_base<P> {
     }
 
     static leaf<P>* make(int ksufsize, phantom_epoch_type phantom_epoch, threadinfo& ti) {
-        size_t sz = iceil(sizeof(leaf<P>) + std::min(ksufsize, 128), 64);
+        size_t sz = MAX_MEMTAG_MASSTREE_LEAF_ALLOCATION_SIZE; // iceil(sizeof(leaf<P>) + std::min(ksufsize, 128), 64);
         void* ptr = ti.pool_allocate(sz, memtag_masstree_leaf);
+        if (!ptr) {
+            ti.set_last_error(MT_MERR_MAKE_LEAF);
+            return nullptr;
+        }
         leaf<P>* n = new(ptr) leaf<P>(sz, phantom_epoch);
         assert(n);
         if (P::debug_level > 0) {
@@ -330,6 +340,10 @@ class leaf : public node_base<P> {
     }
     static leaf<P>* make_root(int ksufsize, leaf<P>* parent, threadinfo& ti) {
         leaf<P>* n = make(ksufsize, parent ? parent->phantom_epoch() : phantom_epoch_type(), ti);
+        if (!n) {
+            ti.set_last_error(MT_MERR_MAKE_ROOT_LEAF);
+            return nullptr;
+        }
         n->next_.ptr = n->prev_ = 0;
         n->ikey0_[0] = 0; // to avoid undefined behavior
         n->make_layer_root();
@@ -413,7 +427,9 @@ class leaf : public node_base<P> {
     }
     Str ksuf(int p, int keylenx) const {
         (void) keylenx;
-        masstree_precondition(keylenx_has_ksuf(keylenx));
+        // keylenx might not be equal to ksuf_keylenx, as this operation might be called without holding the leaf's lock.
+        // We allow it, and expect the caller to validate the leaf's version and retry.
+        //masstree_precondition(keylenx_has_ksuf(keylenx));
         return ksuf_ ? ksuf_->get(p) : iksuf_[0].get(p);
     }
     Str ksuf(int p) const {
@@ -429,7 +445,7 @@ class leaf : public node_base<P> {
         return s.len == ka.suffix().len
             && string_slice<uintptr_t>::equals_sloppy(s.s, ka.suffix().s, s.len);
     }
-    // Returns 1 if match & not layer, 0 if no match, <0 if match and layer
+    // Returns 1 if match & not layer, 0 if no match, < 0 if match and layer
     int ksuf_matches(int p, const key_type& ka) const {
         int keylenx = keylenx_[p];
         if (keylenx < ksuf_keylenx)
@@ -520,40 +536,55 @@ class leaf : public node_base<P> {
         modstate_ = modstate_deleted_layer;
     }
 
-    inline void assign(int p, const key_type& ka, threadinfo& ti) {
+    inline bool assign(int p, const key_type& ka, threadinfo& ti) {
         lv_[p] = leafvalue_type::make_empty();
         ikey0_[p] = ka.ikey();
         if (!ka.has_suffix()) {
             keylenx_[p] = ka.length();
         } else {
             keylenx_[p] = ksuf_keylenx;
-            assign_ksuf(p, ka.suffix(), false, ti);
+            if (!assign_ksuf(p, ka.suffix(), false, ti)) {
+                ti.set_last_error(MT_MERR_LEAF_ASSIGN);
+                return false;
+            }
         }
+
+        return true;
     }
-    inline void assign_initialize(int p, const key_type& ka, threadinfo& ti) {
+    inline bool assign_initialize(int p, const key_type& ka, threadinfo& ti) {
         lv_[p] = leafvalue_type::make_empty();
         ikey0_[p] = ka.ikey();
         if (!ka.has_suffix()) {
             keylenx_[p] = ka.length();
         } else {
             keylenx_[p] = ksuf_keylenx;
-            assign_ksuf(p, ka.suffix(), true, ti);
+            if (!assign_ksuf(p, ka.suffix(), true, ti)) {
+                ti.set_last_error(MT_MERR_ASSIGN_INITALIZE_1);
+                return false;
+            }
         }
+
+        return true;
     }
-    inline void assign_initialize(int p, leaf<P>* x, int xp, threadinfo& ti) {
+    inline bool assign_initialize(int p, leaf<P>* x, int xp, threadinfo& ti) {
         lv_[p] = x->lv_[xp];
         ikey0_[p] = x->ikey0_[xp];
         keylenx_[p] = x->keylenx_[xp];
         if (x->has_ksuf(xp)) {
-            assign_ksuf(p, x->ksuf(xp), true, ti);
+            if (!assign_ksuf(p, x->ksuf(xp), true, ti)) {
+                ti.set_last_error(MT_MERR_ASSIGN_INITALIZE_2);
+                return false;
+            }
         }
+
+        return true;
     }
     inline void assign_initialize_for_layer(int p, const key_type& ka) {
         assert(ka.has_suffix());
         ikey0_[p] = ka.ikey();
         keylenx_[p] = layer_keylenx;
     }
-    void assign_ksuf(int p, Str s, bool initializing, threadinfo& ti);
+    bool assign_ksuf(int p, Str s, bool initializing, threadinfo& ti);
 
     inline ikey_type ikey_after_insert(const permuter_type& perm, int i,
                                        const tcursor<P>* cursor) const;
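assign() and assign_initialize() can only fail through assign_ksuf(), which first tries the leaf's fixed internal suffix area (the 128-byte budget reflected in MAX_MEMTAG_MASSTREE_LEAF_ALLOCATION_SIZE) and only then allocates an external suffix block, which is the allocation that may now fail. The class below is a simplified model of that two-tier decision, not the patch's data layout.

    #include <cstddef>
    #include <new>
    #include <optional>
    #include <string>

    // Two-tier suffix storage: small inline area first, heap fallback second.
    class suffix_slot {
      public:
        explicit suffix_slot(std::size_t inline_cap = 128) : inline_cap_(inline_cap) {}

        // Returns false only when the external (heap) allocation fails,
        // mirroring the new bool result of assign_ksuf().
        bool assign(const std::string& suffix) {
            if (suffix.size() <= inline_cap_) {
                inline_value_ = suffix;          // fits in the internal iksuf_-like area
                return true;
            }
            try {
                external_value_ = suffix;        // stands in for the external ksuffix block
            } catch (const std::bad_alloc&) {
                return false;                    // report OOM instead of crashing
            }
            return true;
        }

      private:
        std::size_t inline_cap_;
        std::string inline_value_;
        std::optional<std::string> external_value_;
    };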

@@ -763,14 +794,14 @@ leaf<P>* leaf<P>::advance_to_key(const key_type& ka, nodeversion_type& v,
    positions [0,p) are ready: keysuffixes in that range are copied. In either
    case, the key at position p is NOT copied; it is assigned to @a s. */
 template <typename P>
-void leaf<P>::assign_ksuf(int p, Str s, bool initializing, threadinfo& ti) {
+bool leaf<P>::assign_ksuf(int p, Str s, bool initializing, threadinfo& ti) {
     if ((ksuf_ && ksuf_->assign(p, s))
         || (extrasize64_ > 0 && iksuf_[0].assign(p, s))) {
 #if !(defined(__x86_64__) || defined(__x86__))
         fence();
-#endif
-        return;
+#endif
+        return true;
     }
 
     external_ksuf_type* oksuf = ksuf_;
@@ -796,15 +827,19 @@ void leaf<P>::assign_ksuf(int p, Str s, bool initializing, threadinfo& ti) {
         sz = std::max(sz, oksuf->capacity());
 
     void* ptr = ti.allocate(sz, memtag_masstree_ksuffixes, &sz);
+    if (!ptr) {
+        ti.set_last_error(MT_MERR_ASSIGN_KSUF);
+        return false;
+    }
     external_ksuf_type* nksuf = new(ptr) external_ksuf_type(width, sz);
     for (int i = 0; i < n; ++i) {
         int mp = initializing ? i : perm[i];
         if (mp != p && has_ksuf(mp)) {
-            bool ok = nksuf->assign(mp, ksuf(mp));
+            bool ok = nksuf->assign(mp, ksuf(mp)); // No memory allocation here
             assert(ok); (void) ok;
         }
     }
-    bool ok = nksuf->assign(p, s);
+    bool ok = nksuf->assign(p, s); // No memory allocation here
     assert(ok); (void) ok;
     fence();
@@ -824,11 +859,12 @@ void leaf<P>::assign_ksuf(int p, Str s, bool initializing, threadinfo& ti) {
 
     if (oksuf)
         ti.deallocate_rcu(oksuf, oksuf->capacity(),
                           memtag_masstree_ksuffixes);
+
+    return true;
 }
 
 template <typename P>
 inline basic_table<P>::basic_table()
-    : root_(0) {
+    : root_(nullptr) {
 }
 
 template <typename P>
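basic_table and tcursor now hand around a node_base<P>** (root_ref) rather than a copy of the root pointer, so that the layer-removal callback can rewrite the live root slot instead of a stale copy. The snippet below shows, with plain pointers, why the extra level of indirection is needed; it is a generic illustration, not the patch's types.

    #include <cassert>

    struct node { int id; };

    // A copy of the root pointer: replacing the root is invisible to the tree.
    static void replace_root_by_value(node* root, node* new_root) {
        root = new_root;          // only the local copy changes
        (void)root;
    }

    // A pointer to the root slot: the tree itself is updated.
    static void replace_root_by_ref(node** root_ref, node* new_root) {
        *root_ref = new_root;     // the slot held by the table / parent layer changes
    }

    int main() {
        node a{1}, b{2};
        node* root = &a;
        replace_root_by_value(root, &b);
        assert(root == &a);       // unchanged
        replace_root_by_ref(&root, &b);
        assert(root == &b);       // the real slot now points at the new root
        return 0;
    }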

diff --git a/masstree_tcursor.hh b/masstree_tcursor.hh
index 2442c9f..755588d 100644
--- a/masstree_tcursor.hh
+++ b/masstree_tcursor.hh
@@ -106,22 +106,23 @@ class tcursor {
     static constexpr int new_nodes_size = 1; // unless we make a new trie newnodes will have at most 1 item
     typedef small_vector<std::pair<leaf_type*, nodeversion_value_type>, new_nodes_size> new_nodes_type;
 
+#ifndef MOT_OBSOLETE_CODE
     tcursor(basic_table<P>& table, Str str)
         : ka_(str), root_(table.fix_root()) {
     }
     tcursor(basic_table<P>& table, const char* s, int len)
         : ka_(s, len), root_(table.fix_root()) {
     }
-    tcursor(basic_table<P>& table, const unsigned char* s, int len)
-        : ka_(reinterpret_cast<const char*>(s), len), root_(table.fix_root()) {
-    }
-    tcursor(node_base<P>* root, const char* s, int len)
-        : ka_(s, len), root_(root) {
-    }
     tcursor(node_base<P>* root, const unsigned char* s, int len)
         : ka_(reinterpret_cast<const char*>(s), len), root_(root) {
     }
-
+#endif
+    tcursor(basic_table<P>& table, const unsigned char* s, int len)
+        : ka_(reinterpret_cast<const char*>(s), len), root_(table.fix_root()), root_ref_(table.root_ref()) {
+    }
+    tcursor(node_base<P>** root_ref, const char* s, int len)
+        : ka_(s, len), root_(*root_ref), root_ref_(root_ref) {
+    }
     inline bool has_value() const {
         return kx_.p >= 0;
     }
@@ -148,13 +149,13 @@ class tcursor {
     inline nodeversion_value_type updated_version_value() const {
         return updated_v_;
     }
-
+#ifndef MOT_OBSOLETE_CODE
     inline const new_nodes_type &new_nodes() const {
         return new_nodes_;
     }
-
+#endif
     inline bool find_locked(threadinfo& ti);
-    inline bool find_insert(threadinfo& ti);
+    inline bool find_insert(threadinfo& ti, bool & found);
 
     inline void finish(int answer, threadinfo& ti);
@@ -166,13 +167,16 @@ class tcursor {
     key_type ka_;
     key_indexed_position kx_;
     node_base<P>* root_;
+    node_base<P>** root_ref_;
     int state_;
 
-    leaf_type* original_n_;
+    leaf_type* original_n_ = nullptr;
     nodeversion_value_type original_v_;
     nodeversion_value_type updated_v_;
-    new_nodes_type new_nodes_;
 
+#ifndef MOT_OBSOLETE_CODE
+    new_nodes_type new_nodes_;
+#endif
     inline node_type* reset_retry() {
         ka_.unshift_all();
         return root_;
@@ -180,6 +184,7 @@ class tcursor {
 
     bool make_new_layer(threadinfo& ti);
     bool make_split(threadinfo& ti);
+    void release_internodes(internode_type * internodes_array[], int start, int end, threadinfo& ti);
     friend class leaf<P>;
     inline void finish_insert();
     inline bool finish_remove(threadinfo& ti);
@@ -191,7 +196,7 @@ class tcursor {
      * If removing a leaf in layer 0, @a prefix is empty.
      * If removing, for example, the node containing key "01234567ABCDEF" in the layer-1 tree
      * rooted at "01234567", then @a prefix should equal "01234567". */
-    static bool remove_leaf(leaf_type* leaf, node_type* root,
+    static bool remove_leaf(leaf_type* leaf, node_type** root_ref,
                             Str prefix, threadinfo& ti);
 
     bool gc_layer(threadinfo& ti);
 
diff --git a/mot_masstree_config.hpp b/mot_masstree_config.hpp
index bec2ec8..fe3930b 100644
--- a/mot_masstree_config.hpp
+++ b/mot_masstree_config.hpp
@@ -25,6 +25,9 @@
 #ifndef MOT_MASSTREE_CONFIG_HPP
 #define MOT_MASSTREE_CONFIG_HPP
 
+// Ignore masstree code which is obsolete in MOT
+#define MOT_OBSOLETE_CODE 1
+
 #define MOT_HAVE_CXX_TEMPLATE_ALIAS 1
 #define MOT_HAVE_INT64_T_IS_LONG 1
 #define MOT_HAVE_SIZE_T_IS_UNSIGNED_LONG 1
@@ -58,7 +61,7 @@
 #define MOT_SIZEOF_LONG_LONG 8
 #define MOT_SIZEOF_SHORT 2
 #define MOT_WORDS_BIGENDIAN_SET 1
-
+/*
 #define masstree_invariant(x, ...) \
     do { \
     } while (0)
@@ -66,6 +69,12 @@
 #define masstree_precondition(x, ...) \
     do { \
     } while (0)
+*/
+
+#define masstree_invariant(x, ...) assert(x)
+#define masstree_precondition(x, ...) assert(x)
+
+
 #ifndef invariant
 #define invariant masstree_invariant
-- 
1.8.3.1
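Note on the config change: masstree_invariant and masstree_precondition are no longer compiled out unconditionally; they now forward to assert, so the invariants added throughout this patch are checked in debug builds while an NDEBUG build still compiles them away. A minimal sketch of the resulting behavior, assuming <cassert> is reachable from the existing includes:

    #include <cassert>

    #define masstree_invariant(x, ...) assert(x)
    #define masstree_precondition(x, ...) assert(x)

    // Debug build: aborts if the condition is violated.
    // NDEBUG build: assert(x) expands to a no-op, so release binaries behave as before.
    static inline void set_last_error_example(int error) {
        masstree_invariant(error < 30);   // e.g. the check inside threadinfo::set_last_error
    }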