From 0a2c88b6297fda231a5ca0f922bf732a8d0445dd Mon Sep 17 00:00:00 2001
From: Vinoth Veeraraghavan
Date: Wed, 23 Mar 2022 19:09:26 +0700
Subject: [PATCH] Masstree OOM feature + bug fixes

More detail:
1. Feature: Add support for memory allocation failure
2. Bug fix: RCU/GC: node deleted before being disconnected
3. Bug fix: Leaves are marked as root by mistake
4. Bug fix: GC layer might access an already freed node
5. Bug fix: Memory leak (layers are not being removed)
6. Optimization: Disable some debug/unused code (collect new nodes)
7. Optimization: Disable phantom epoch support (not in use by MOT)
8. Optimization: Extend node version to 64 bits
9. Optimization: Optimize leaf size to support a larger internal ksuffix
---
 kvthread.hh             |  96 ++++++++++++++++++++++++++++++++-----
 masstree.hh             |   5 +-
 masstree_insert.hh      |  78 +++++++++++++++++++++++++++----
 masstree_remove.hh      |  17 +++++--
 masstree_scan.hh        |   4 +-
 masstree_split.hh       | 122 +++++++++++++++++++++++++++++++++++++++++++-----
 masstree_struct.hh      |  72 +++++++++++++++++++++-------
 masstree_tcursor.hh     |  31 ++++++------
 mot_masstree_config.hpp |  11 ++++-
 9 files changed, 365 insertions(+), 71 deletions(-)

diff --git a/kvthread.hh b/kvthread.hh
index 2c75e4e..364cd44 100644
--- a/kvthread.hh
+++ b/kvthread.hh
@@ -24,6 +24,48 @@
 #include
 #include
 #include
+#include <vector>
+
+enum {
+    MT_MERR_OK = 0,
+    MT_MERR_MAKE_SPLIT_PRE_ALLOC = 1,
+    MT_MERR_MAKE_SPLIT_LEAF_ALLOC = 2,
+    MT_MERR_MAKE_NEW_LAYER_LEAF_ALLOC_1 = 3,
+    MT_MERR_MAKE_NEW_LAYER_LEAF_ALLOC_2 = 4,
+    MT_MERR_MAKE_NEW_LAYER_KSUFFIX_ALLOC_1 = 5,
+    MT_MERR_MAKE_NEW_LAYER_KSUFFIX_ALLOC_2 = 6,
+    MT_MERR_FIND_INSERT_ASSIGN_SUFFIX = 7,
+    MT_MERR_SPLIT_INTO_ASSIGN_INITALIZE_1 = 8,
+    MT_MERR_SPLIT_INTO_ASSIGN_INITALIZE_2 = 9,
+    MT_MERR_GC_LAYER_REMOVAL_MAKE = 10,
+    MT_MERR_MAKE_SPLIT_ASSIGN_SUFFIX = 11,
+    MT_MERR_MAKE_SPLIT_PERM_EXCHANGE = 12,
+
+    // Errors that are handled internally (the operation should succeed even if last_error contains them)
+    MT_MERR_NON_DISRUPTIVE_ERRORS = 15,
+
+    // We should not return the following errors to the user, as they are covered by other errors in an upper layer
+    MT_MERR_NOT_RETURNED_TO_USER_ERRORS = 20,
+    MT_MERR_ASSIGN_KSUF = 21,
+    MT_MERR_MAKE_LEAF = 22,
+    MT_MERR_MAKE_ROOT_LEAF = 23,
+    MT_MERR_MAKE_INTERNODE = 24,
+    MT_MERR_LEAF_ASSIGN = 25,
+    MT_MERR_ASSIGN_INITALIZE_1 = 26,
+    MT_MERR_ASSIGN_INITALIZE_2 = 27,
+
+    // We should never reach the following errors
+    MT_MERR_UNREACHABLE_ERRORS = 30,
+    MT_MERR_MAKE_SPLIT_INTERNODE_ALLOC_NOT_EXPECTED,
+    MT_MERR_MAKE_SPLIT_INTERNODE_ALLOC_NOT_EXPECTED_2,
+    MT_MERR_MAKE_SPLIT_INTERNODE_ALLOC_EMPTY_PRE_ALLOC_NOT_EXPECTED,
+    MT_MERR_MAKE_SPLIT_INTERNODE_ALLOC_EMPTY_PRE_ALLOC_NOT_EXPECTED_2,
+
+    MT_MERR_NOT_IN_USE_LAST_ENTRY = 40
+};
+
+#define MAX_ALLOC_ERROR_TYPES MT_MERR_NOT_IN_USE_LAST_ENTRY
+
 class threadinfo;
 class loginfo;
@@ -42,7 +84,7 @@ extern volatile mrcu_epoch_type globalepoch;  // global epoch, updated regularly
 extern volatile mrcu_epoch_type active_epoch;
 
 // Memtags max allocation size
-#define MAX_MEMTAG_MASSTREE_LEAF_ALLOCATION_SIZE iceil(sizeof(leaf<P>) + 128, 64)
+#define MAX_MEMTAG_MASSTREE_LEAF_ALLOCATION_SIZE iceil(sizeof(leaf<P>) + 128, 64)
 #define MAX_MEMTAG_MASSTREE_INTERNODE_ALLOCATION_SIZE sizeof(internode<P>)
 #define MAX_MEMTAG_MASSTREE_LIMBO_GROUP_ALLOCATION_SIZE sizeof(mt_limbo_group)
@@ -117,6 +159,12 @@ class alignas(64) threadinfo {
         TI_MAIN, TI_PROCESS, TI_LOG, TI_CHECKPOINT
     };
 
+    typedef struct rcu_entry {
+        void* p;
+        size_t sz;
+        memtag tag;
+    } rcu_entry_t;
+
     static threadinfo* allthreads;
 
     threadinfo* next() const {
@@ -229,15 +277,14 @@ class alignas(64) threadinfo {
     void deallocate_rcu(void* p, size_t sz, memtag tag) {
         assert(p);
-        memdebug::check_rcu(p, sz, tag);
-        record_rcu(p, sz, tag);
-        mark(threadcounter(tc_alloc + (tag > memtag_value)), -sz);
+        dealloc_rcu.push_back({p, sz, tag});
     }
 
     void* pool_allocate(size_t sz, memtag tag) {
         void* p = NULL;
         int nl = (sz + memdebug_size + CACHE_LINE_SIZE - 1) / CACHE_LINE_SIZE;
         if (use_pool()) {
+            masstree_invariant(false); // MOT code should not reach here
             assert(nl <= pool_max_nlines);
             if (unlikely(!pool_[nl - 1]))
                 refill_pool(nl);
@@ -264,17 +311,30 @@ class alignas(64) threadinfo {
             *reinterpret_cast<void**>(p) = pool_[nl - 1];
             pool_[nl - 1] = p;
         } else
-            free(p);
+            deallocate(p, sz, tag); // MOT memory deallocation
         mark(threadcounter(tc_alloc + (tag > memtag_value)),
              -nl * CACHE_LINE_SIZE);
     }
 
     void pool_deallocate_rcu(void* p, size_t sz, memtag tag) {
-        int nl = (sz + memdebug_size + CACHE_LINE_SIZE - 1) / CACHE_LINE_SIZE;
-        assert(p && nl <= pool_max_nlines);
-        memdebug::check_rcu(p, sz, memtag(tag + nl));
-        record_rcu(p, sz, use_pool() ? memtag(tag + nl) : tag);
-        mark(threadcounter(tc_alloc + (tag > memtag_value)),
-             -nl * CACHE_LINE_SIZE);
+        if (unlikely(use_pool())) {
+            int nl = (sz + memdebug_size + CACHE_LINE_SIZE - 1) / CACHE_LINE_SIZE;
+            assert(p && nl <= pool_max_nlines);
+            memdebug::check_rcu(p, sz, memtag(tag + nl));
+            mark(threadcounter(tc_alloc + (tag > memtag_value)),
+                 -nl * CACHE_LINE_SIZE);
+            dealloc_rcu.push_back({p, sz, memtag(tag + nl)});
+        } else {
+            dealloc_rcu.push_back({p, sz, tag});
+        }
+    }
+
+    void add_nodes_to_gc() {
+        for (uint32_t i = 0 ; i < dealloc_rcu.size() ; i++) {
+            masstree_invariant(dealloc_rcu[i].p);
+            record_rcu(dealloc_rcu[i].p, dealloc_rcu[i].sz, dealloc_rcu[i].tag);
+            dealloc_rcu[i].p = nullptr;
+        }
+        dealloc_rcu.clear();
     }
 
     // RCU
@@ -308,6 +368,11 @@ class alignas(64) threadinfo {
         return pthreadid_;
     }
 
+    inline void set_last_error(int error) { masstree_invariant(error < MT_MERR_UNREACHABLE_ERRORS); last_error = error; }
+    inline int get_last_error() { return last_error; }
+    inline bool non_disruptive_error() { return last_error == 0 ||
+        (last_error > MT_MERR_NON_DISRUPTIVE_ERRORS && last_error < MT_MERR_NOT_RETURNED_TO_USER_ERRORS); }
+
     void report_rcu(void* ptr) const;
     static void report_rcu_all(void* ptr);
     static inline mrcu_epoch_type min_active_epoch();
@@ -333,8 +398,14 @@ class alignas(64) threadinfo {
 #endif
     }
 
+    bool is_empty_rcu_array() {
+        return dealloc_rcu.size() == 0;
+    }
+
   private:
     MOT::MasstreePrimaryIndex * cur_working_index;
+    int last_error = MT_MERR_OK;
+    std::vector<rcu_entry_t> dealloc_rcu;
     union {
         struct {
             mrcu_epoch_type gc_epoch_;
@@ -386,7 +457,8 @@ class alignas(64) threadinfo {
     void ng_record_rcu(void* ptr, int size, memtag tag);
 
     void record_rcu(void* ptr, int size, memtag tag) {
-        if (use_pool()) {
+        if (unlikely(use_pool())) {
+            masstree_invariant(false); // MOT code should not reach here
             if (limbo_tail_->tail_ + 2 > limbo_tail_->capacity)
                 refill_rcu();
             uint64_t epoch = ng_getGlobalEpoch();
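The kvthread.hh changes above split RCU reclamation into two phases: deallocate_rcu() and pool_deallocate_rcu() only queue the node in a per-thread vector, and record_rcu() runs later, when add_nodes_to_gc() is called once the structural change has been published. The sketch below shows just that buffering pattern in isolation; the types and the record_rcu callback are simplified stand-ins, not the patch's real threadinfo API.

    #include <cstddef>
    #include <vector>

    using memtag = int;                                   // stand-in for the real memtag enum
    struct rcu_entry { void* p; std::size_t sz; memtag tag; };

    class deferred_rcu {
      public:
        // Phase 1: node is still reachable by readers - only remember it.
        void deallocate_rcu(void* p, std::size_t sz, memtag tag) {
            dealloc_rcu_.push_back({p, sz, tag});
        }
        // Phase 2: node is unlinked and the operation committed - hand the
        // whole batch to the epoch-based reclaimer (record_rcu in the patch).
        template <typename RecordFn>
        void add_nodes_to_gc(RecordFn record_rcu) {
            for (rcu_entry& e : dealloc_rcu_) {
                record_rcu(e.p, e.sz, e.tag);
                e.p = nullptr;
            }
            dealloc_rcu_.clear();
        }
        bool is_empty_rcu_array() const { return dealloc_rcu_.empty(); }
      private:
        std::vector<rcu_entry> dealloc_rcu_;
    };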
diff --git a/masstree.hh b/masstree.hh
index eaf6503..89af0ee 100644
--- a/masstree.hh
+++ b/masstree.hh
@@ -42,8 +42,8 @@ template <int LW, int IW> struct nodeparams {
     static constexpr int bound_method = bound_method_fast;
     static constexpr int debug_level = 0;
     typedef uint64_t ikey_type;
-    typedef uint32_t nodeversion_value_type;
-    static constexpr bool need_phantom_epoch = true;
+    typedef uint64_t nodeversion_value_type;
+    static constexpr bool need_phantom_epoch = false;
     typedef uint64_t phantom_epoch_type;
     static constexpr ssize_t print_max_indent_depth = 12;
     typedef key_unparse_printable_string key_unparse_type;
@@ -95,6 +95,7 @@ class basic_table {
     inline node_type* root() const;
     inline node_type* fix_root();
+    inline node_type** root_ref() { return &root_; }
 
     bool get(Str key, value_type& value, threadinfo& ti) const;
 
diff --git a/masstree_insert.hh b/masstree_insert.hh
index 4a71942..e641f03 100644
--- a/masstree_insert.hh
+++ b/masstree_insert.hh
@@ -21,15 +21,18 @@ namespace Masstree {
 
 template <typename P>
-bool tcursor<P>::find_insert(threadinfo& ti)
+bool tcursor<P>::find_insert(threadinfo& ti, bool & found)
 {
+    found = false;
     find_locked(ti);
     original_n_ = n_;
     original_v_ = n_->full_unlocked_version_value();
 
     // maybe we found it
-    if (state_)
+    if (state_) {
+        found = true;
         return true;
+    }
 
     // otherwise mark as inserted but not present
     state_ = 2;
@@ -59,8 +62,11 @@ bool tcursor<P>::find_insert(threadinfo& ti)
        1. If leaf is the most left leaf in the btree which means ikey0_[0] is not used as a boundary. (!n_->prev_)
        2. If a new key, with ikey == ikey0_[0], is added. In this case, we can re-use slot 0 as we won't change the tree's structure. (n_->ikey_bound() == ka_.ikey()) */
     if (likely(kx_.p != 0) || !n_->prev_ || n_->ikey_bound() == ka_.ikey()) {
-        n_->assign(kx_.p, ka_, ti);
-        return false;
+        // If n_->assign fails, we don't have enough space to place the suffix and we failed to allocate a larger ksuffix.
+        bool res = n_->assign(kx_.p, ka_, ti);
+        if (!res)
+            ti.set_last_error(MT_MERR_FIND_INSERT_ASSIGN_SUFFIX);
+        return res;
     }
 }
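With the new signature, find_insert's return value reports whether the cursor is usable (no disruptive allocation failure), while the found out-parameter reports whether the key already existed. A hypothetical caller could therefore look roughly like the template below; insert_or_get and its value handling are invented for illustration and are not part of the patch.

    // Hypothetical wrapper around the new API (illustrative names only).
    template <typename Cursor, typename ThreadInfo, typename Value>
    bool insert_or_get(Cursor& lp, ThreadInfo& ti, const Value& v, bool& existed) {
        if (!lp.find_insert(ti, existed)) {   // allocation failed somewhere below
            lp.finish(0, ti);                 // abort: unlock and flush deferred GC
            return false;                     // caller may inspect ti.get_last_error()
        }
        if (!existed)
            lp.value() = v;                   // publish the value in the reserved slot
        lp.finish(1, ti);                     // commit the insert
        return true;
    }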

@@ -78,8 +84,13 @@ bool tcursor<P>::make_new_layer(threadinfo& ti) {
     // For each ikey_size bytes (currently 8) that matches in both key's suffixes, we will need to create a new layer
     leaf_type* twig_head = n_;
     leaf_type* twig_tail = n_;
+    leaf_type* nl = nullptr;
     while (kcmp == 0) {
-        leaf_type* nl = leaf_type::make_root(0, twig_tail, ti);
+        nl = leaf_type::make_root(0, twig_tail, ti);
+        if (!nl) {
+            ti.set_last_error(MT_MERR_MAKE_NEW_LAYER_LEAF_ALLOC_1);
+            goto make_new_layer_cleanup;
+        }
         nl->assign_initialize_for_layer(0, oka);
         if (twig_head != n_)
             twig_tail->lv_[0] = nl;
@@ -87,7 +98,9 @@ bool tcursor<P>::make_new_layer(threadinfo& ti) {
             twig_head = nl;
         nl->permutation_ = permuter_type::make_sorted(1);
         twig_tail = nl;
+#ifndef MOT_OBSOLETE_CODE
         new_nodes_.emplace_back(nl, nl->full_unlocked_version_value());
+#endif
         oka.shift();
         ka_.shift();
         // Compare the ikey only. if ikey matches and one or more of the suffixes != 0, compare using suffix size
@@ -102,9 +115,24 @@ bool tcursor<P>::make_new_layer(threadinfo& ti) {
             + n_->iksuf_[0].overhead(n_->width);
     else
         ksufsize = 0;
-    leaf_type *nl = leaf_type::make_root(ksufsize, twig_tail, ti);
-    nl->assign_initialize(0, kcmp < 0 ? oka : ka_, ti);
-    nl->assign_initialize(1, kcmp < 0 ? ka_ : oka, ti);
+    nl = leaf_type::make_root(ksufsize, twig_tail, ti);
+    if (!nl) {
+        ti.set_last_error(MT_MERR_MAKE_NEW_LAYER_LEAF_ALLOC_2);
+        goto make_new_layer_cleanup;
+    }
+    // Even though the total ksuffix size was already provided to make_root, more memory might be allocated in the assign_initialize calls,
+    // as the leaf's internal suffix is bounded by 128 (+ 64 alignment).
+    // We will hit this issue (for sure) if ka_.suffix_length() + oka.suffix_length() > 192, but might hit it also when ka_.suffix_length() + oka.suffix_length() > 128.
+    if (!nl->assign_initialize(0, kcmp < 0 ? oka : ka_, ti)) {
+        ti.set_last_error(MT_MERR_MAKE_NEW_LAYER_KSUFFIX_ALLOC_1);
+        goto make_new_layer_cleanup;
+    }
+
+    if (!nl->assign_initialize(1, kcmp < 0 ? ka_ : oka, ti)) {
+        ti.set_last_error(MT_MERR_MAKE_NEW_LAYER_KSUFFIX_ALLOC_2);
+        goto make_new_layer_cleanup;
+    }
+
     nl->lv_[kcmp > 0] = n_->lv_[kx_.p];
     nl->lock(*nl, ti.lock_fence(tc_leaf_lock));
     if (kcmp < 0)
@@ -134,6 +162,33 @@ bool tcursor<P>::make_new_layer(threadinfo& ti) {
     n_->unlock();
     n_ = nl;
     kx_.i = kx_.p = kcmp < 0;
+    return true;
+
+make_new_layer_cleanup:
+    // n_ was not updated yet. It still contains the original key (without any change). It will be unlocked later on (in lp.finish).
+    if (nl) {
+        // nl is not connected to twig_tail yet. Handle it separately.
+        nl->deallocate(ti);
+        nl = nullptr;
+    }
+
+    // Leaves in the leaf list (starting from twig_head) have no suffix. In addition, they are not connected to the masstree yet, so we don't need to hold any locks.
+    if (twig_head != n_) {
+        while (twig_head) {
+            masstree_invariant(!twig_head->ksuf_);
+            masstree_invariant(twig_head->size() == 1);
+            masstree_invariant(twig_head->is_layer(0));
+            masstree_invariant(twig_head->stable_annotated(ti.stable_fence()).is_root());
+            leaf_type *next_layer_leaf = (leaf_type *)twig_head->lv_[0].layer();
+            twig_head->lv_[0] = nullptr;
+            // Remove it directly. No need to use rcu.
+            ti.deallocate(twig_head, sizeof(*twig_head) /* Being ignored */, memtag_masstree_leaf);
+            // Stop if we just finished handling the last leaf in the list (twig_tail).
+            // Validating that next_layer_leaf != null won't work, as twig_tail->lv_[0] == twig_tail.
+            twig_head = (twig_head == twig_tail) ? nullptr : next_layer_leaf;
+        }
+    }
+
     return false;
 }
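The loop above builds one single-entry layer leaf per 8-byte ikey slice on which the two keys still agree, and only the final leaf holds both keys; the cleanup label then tears that chain down again if any allocation fails. The stand-alone helper below illustrates how many intermediate layers two suffixes force; it is an illustration of the loop's termination condition, not code from the patch.

    #include <algorithm>
    #include <cstdint>
    #include <cstring>
    #include <string>

    // Number of fully matching 8-byte ikey slices before two suffixes diverge,
    // i.e. how many single-entry layer leaves make_new_layer would chain up.
    static int matching_ikey_slices(const std::string& a, const std::string& b) {
        int slices = 0;
        std::size_t off = 0;
        while (true) {
            std::uint64_t ia = 0, ib = 0;
            std::memcpy(&ia, a.data() + off, std::min<std::size_t>(8, a.size() - off));
            std::memcpy(&ib, b.data() + off, std::min<std::size_t>(8, b.size() - off));
            if (ia != ib || a.size() - off <= 8 || b.size() - off <= 8)
                return slices;
            ++slices;
            off += 8;
        }
    }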

@@ -152,15 +207,20 @@ inline void tcursor<P>::finish(int state, threadinfo& ti)
 {
     if (state < 0 && state_ == 1) {
         if (finish_remove(ti))
-            return;
+            goto clean_ti;
     } else if (state > 0 && state_ == 2)
         finish_insert();
 
     // we finally know this!
     if (n_ == original_n_)
         updated_v_ = n_->full_unlocked_version_value();
+#ifndef MOT_OBSOLETE_CODE
     else
         new_nodes_.emplace_back(n_, n_->full_unlocked_version_value());
+#endif
 
     n_->unlock();
+
+clean_ti:
+    ti.add_nodes_to_gc();
 }
 
 } // namespace Masstree
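finish() is also where the deferred frees collected during the operation are finally handed to the garbage collector, which is what fixes the delete-before-disconnect ordering. On the error side, MOT can classify whatever is left in last_error using the ranges defined in kvthread.hh; the helper below mirrors threadinfo::non_disruptive_error() with the enum values restated locally, purely as an illustration.

    // Restated locally for illustration; the real values live in kvthread.hh.
    enum {
        EX_MT_MERR_OK = 0,
        EX_MT_MERR_NON_DISRUPTIVE_ERRORS = 15,
        EX_MT_MERR_NOT_RETURNED_TO_USER_ERRORS = 20
    };

    // True when the last error either never happened or was absorbed internally,
    // i.e. the insert/remove itself still succeeded.
    static inline bool non_disruptive(int last_error) {
        return last_error == EX_MT_MERR_OK ||
               (last_error > EX_MT_MERR_NON_DISRUPTIVE_ERRORS &&
                last_error < EX_MT_MERR_NOT_RETURNED_TO_USER_ERRORS);
    }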

diff --git a/masstree_remove.hh b/masstree_remove.hh
index 5795261..a647ea5 100644
--- a/masstree_remove.hh
+++ b/masstree_remove.hh
@@ -144,8 +144,9 @@ void gc_layer_rcu_callback<P>::operator()(threadinfo& ti)
         if (!do_remove || !lp.finish_remove(ti)) {
             lp.n_->unlock();
         }
-        ti.deallocate(this, size(), memtag_masstree_gc);
     }
+    ti.deallocate(this, size(), memtag_masstree_gc);
+    ti.add_nodes_to_gc();
 }
 
 template <typename P>
@@ -172,18 +173,18 @@ bool tcursor<P>::finish_remove(threadinfo& ti)
     if (perm.size()) {
         return false;
     } else {
-        return remove_leaf(n_, root_, ka_.prefix_string(), ti);
+        return remove_leaf(n_, root_ref_, ka_.prefix_string(), ti);
     }
 }
 
 template <typename P>
-bool tcursor<P>::remove_leaf(leaf_type* leaf, node_type* root,
+bool tcursor<P>::remove_leaf(leaf_type* leaf, node_type** root_ref,
                              Str prefix, threadinfo& ti)
 {
     if (!leaf->prev_) {
         if (!leaf->next_.ptr && !prefix.empty()) {
             // Leaf doesn't hold any keys, not in the highest layer and has no neighbors --> entire layer can be destroyed
-            gc_layer_rcu_callback_ng<P>::make(root, prefix, ti);
+            gc_layer_rcu_callback_ng<P>::make(root_ref, prefix, ti);
         }
         // Leaf has neighbor to the right (next) or leaf in the highest layer. do nothing
         return false;
@@ -211,6 +212,14 @@ bool tcursor<P>::remove_leaf(leaf_type* leaf, node_type* root,
     // Unlink leaf from doubly-linked leaf list
     btree_leaflink<leaf_type>::unlink(leaf);
 
+    // leaf->prev_ != NULL
+    leaf_type *prev = leaf->prev_;
+    if (!prev->prev_ && !prev->next_.ptr && prev->size() == 0 && !prefix.empty()) {
+        // After removing the leaf, only the leftmost leaf remains (a single leaf). We can remove the layer, as the leftmost leaf
+        // doesn't hold any keys and the layer is not the highest one.
+        gc_layer_rcu_callback_ng<P>::make(root_ref, prefix, ti);
+    }
+
     // Remove leaf from tree, collapse trivial chains, and rewrite
     // ikey bounds.
     ikey_type ikey = leaf->ikey_bound();
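remove_leaf now schedules a whole-layer removal in a second situation: when, after unlinking the removed leaf, only the leftmost leaf is left, it holds no keys, and the layer is not the top layer (prefix non-empty). The predicate below restates both conditions over a reduced leaf model; the field names are stand-ins for the real leaf members, not the patch's types.

    // Reduced model of the fields the layer-removal decision looks at.
    struct leaf_view {
        bool has_prev;   // leaf->prev_ != nullptr
        bool has_next;   // leaf->next_.ptr != nullptr
        int  nkeys;      // live keys in the leaf
    };

    // Original case: the removed leaf is a lone, leftmost leaf in a lower layer.
    static bool empty_layer_case(const leaf_view& removed, bool prefix_empty) {
        return !prefix_empty && !removed.has_prev && !removed.has_next;
    }

    // New case added by the patch: after unlinking, only the leftmost leaf
    // remains and it no longer holds any keys.
    static bool collapsed_layer_case(const leaf_view& remaining_leftmost, bool prefix_empty) {
        return !prefix_empty && !remaining_leftmost.has_prev &&
               !remaining_leftmost.has_next && remaining_leftmost.nkeys == 0;
    }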

diff --git a/masstree_scan.hh b/masstree_scan.hh
index 31ffcbc..f7b0937 100644
--- a/masstree_scan.hh
+++ b/masstree_scan.hh
@@ -306,8 +306,10 @@ int scanstackelt<P>::find_next(H &helper, key_type &ka, leafvalue_type &entry)
     fence();
     entry = n_->lv_[kp];
     entry.prefetch(keylenx);
-    if (n_->keylenx_has_ksuf(keylenx))
+    if (n_->keylenx_has_ksuf(keylenx)) {
         keylen = ka.assign_store_suffix(n_->ksuf(kp));
+        masstree_invariant(keylen < (int)MASSTREE_MAXKEYLEN);
+    }
 
     if (n_->has_changed(v_))
         goto changed;
 
diff --git a/masstree_split.hh b/masstree_split.hh
index fcf35ee..42b012b 100644
--- a/masstree_split.hh
+++ b/masstree_split.hh
@@ -46,7 +46,7 @@ leaf<P>::ikey_after_insert(const permuter_type& perm, int i,
 
    The split type is 0 if @a ka went into *this, 1 if the @a ka went into
    *@a nr, and 2 for the sequential-order optimization (@a ka went into *@a
-   nr and no other keys were moved). */
+   nr and no other keys were moved). If -1, the split failed due to a memory issue. */
 template <typename P>
 int leaf<P>::split_into(leaf<P>* nr, tcursor<P>* cursor,
                         ikey_type& split_ikey, threadinfo& ti)
@@ -71,7 +71,8 @@ int leaf<P>::split_into(leaf<P>* nr, tcursor<P>* cursor,
     int p = cursor->kx_.i;
     if (p == 0 && !this->prev_) {
         // reverse-sequential optimization
-        mid = 1;
+        // We remove this optimization as it can lead us to an empty leaf (in case the insertion fails)
+        // mid = 1;
     } else if (p == width && !this->next_.ptr) {
         // sequential optimization
         mid = width;
@@ -100,9 +101,16 @@ int leaf<P>::split_into(leaf<P>* nr, tcursor<P>* cursor,
     typename permuter_type::value_type pv = perml.value_from(mid - (p < mid));
     for (int x = mid; x <= width; ++x) {
         if (x == p) {
-            nr->assign_initialize(x - mid, cursor->ka_, ti);
+            if (!nr->assign_initialize(x - mid, cursor->ka_, ti)) {
+                ti.set_last_error(MT_MERR_SPLIT_INTO_ASSIGN_INITALIZE_1);
+                return -1;
+            }
+
         } else {
-            nr->assign_initialize(x - mid, this, pv & 15, ti);
+            if (!nr->assign_initialize(x - mid, this, pv & 15, ti)) {
+                ti.set_last_error(MT_MERR_SPLIT_INTO_ASSIGN_INITALIZE_2);
+                return -1;
+            }
             pv >>= 4;
         }
     }
@@ -174,6 +182,14 @@ int internode<P>::split_into(internode<P>* nr, int p, ikey_type ka,
     }
 }
 
+template <typename P>
+void tcursor<P>::release_internodes(internode_type * internodes_array[], int start, int end, threadinfo& ti) {
+    for (int i = start; i < end; i++) {
+        masstree_invariant(internodes_array[i]);
+        ti.deallocate(internodes_array[i], sizeof(*internodes_array[i]) /* Being ignored */, memtag_masstree_internode);
+        internodes_array[i] = nullptr;
+    }
+}
 
 template <typename P>
 bool tcursor<P>::make_split(threadinfo& ti)
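make_split (next hunk) avoids failing half-way through a multi-level split by reserving all internodes it could possibly need before touching the tree, and release_internodes returns whatever was not consumed. The sketch below shows that reserve/consume/release pattern generically; malloc/free stand in for ti.pool_allocate()/ti.deallocate(), so it is an illustration of the strategy rather than the patch's code.

    #include <cstdlib>
    #include <vector>

    struct internode_reservation {
        std::vector<void*> nodes;
        std::size_t next = 0;

        // Reserve 'count' nodes up front; on failure release everything and report OOM.
        bool reserve(std::size_t count, std::size_t node_size) {
            for (std::size_t i = 0; i < count; ++i) {
                void* p = std::malloc(node_size);
                if (!p) {
                    release_all();
                    return false;
                }
                nodes.push_back(p);
            }
            return true;
        }
        // Consume the next pre-allocated node; the split itself can no longer fail here.
        void* take() { return next < nodes.size() ? nodes[next++] : nullptr; }
        // Free the nodes the split did not end up needing (release_internodes in the patch).
        void release_unused() {
            while (nodes.size() > next) {
                std::free(nodes.back());
                nodes.pop_back();
            }
        }
        void release_all() { next = 0; release_unused(); }
    };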

@@ -191,17 +207,66 @@ bool tcursor<P>::make_split(threadinfo& ti)
         if (kx_.p != 0) {
             n_->permutation_ = perm.value();
             fence();
-            n_->assign(kx_.p, ka_, ti);
+            if (n_->assign(kx_.p, ka_, ti)) {
+                return true;
+            }
+            ti.set_last_error(MT_MERR_MAKE_SPLIT_PERM_EXCHANGE);
+            return false;
+        }
+    }
+
+    bool rc = true;
+
+    // 2 optimizations that can reduce the number of internode allocations:
+    // 1. If n_ does not have a parent, only 1 internode is required (rare case - only on the first split)
+    // 2. In case n_'s parent has a free slot and its height is 1, we don't need internodes at all (common case, but requires an early lock of n_'s parent)
+    node_type* l_root = n_;
+
+    while (!l_root->is_root()) {
+        if (n_ != l_root) {
+            l_root->stable_annotated(ti.stable_fence());
+        }
+        l_root = l_root->maybe_parent();
+    }
+
+    // l_root->height_ is the layer's real height or higher.
+    uint32_t layer_height = l_root->isleaf() ? 1 : ((internode_type *)l_root)->height_;
+    int reserved_nodes = layer_height + 5; // add 5 extra nodes (extra 5 layers in a single b-tree)
+    internode_type * preallocated_internodes[reserved_nodes + 1] = { 0 };
+    int cur_cache_index = 0;
+
+    for (int i = 0; i < reserved_nodes; i++) {
+        preallocated_internodes[i] = (internode_type *)ti.pool_allocate(MAX_MEMTAG_MASSTREE_INTERNODE_ALLOCATION_SIZE,
+                                                                        memtag_masstree_internode);
+        if (!preallocated_internodes[i]) {
+            release_internodes(preallocated_internodes, 0, i, ti);
+            ti.set_last_error(MT_MERR_MAKE_SPLIT_PRE_ALLOC);
             return false;
         }
     }
 
     node_type* child = leaf_type::make(n_->ksuf_used_capacity(), n_->phantom_epoch(), ti);
+    if (!child) {
+        release_internodes(preallocated_internodes, 0, reserved_nodes, ti);
+        ti.set_last_error(MT_MERR_MAKE_SPLIT_LEAF_ALLOC);
+        return false;
+    }
     child->assign_version(*n_);
+    child->mark_nonroot();
+    // As n_ is locked, child is locked as well.
     ikey_type xikey[2];
     // Add the new key and spread the keys between the 2 leafs. The new key might be inserted to either one of the leafs. Link to parent will be done later.
     int split_type = n_->split_into(static_cast<leaf_type *>(child), this, xikey[0], ti);
+
+    if (split_type < 0) {
+        // Split failed due to a ksuffix memory allocation error (child is not connected to n_ at this stage)
+        release_internodes(preallocated_internodes, 0, reserved_nodes, ti);
+        // child is not visible yet, so we can deallocate without rcu
+        ((leaf_type *)child)->deallocate(ti);
+        child = nullptr;
+        return false;
+    }
     unsigned sense = 0;
     node_type* n = n_;
     uint32_t height = 0;
@@ -219,7 +284,17 @@ bool tcursor<P>::make_split(threadinfo& ti)
         }
 
         if (kp < 0 || p->height_ > height + 1) {
-            internode_type *nn = internode_type::make(height + 1, ti);
+            masstree_invariant(preallocated_internodes[cur_cache_index]);
+            internode_type *nn = internode_type::make(height + 1, ti, preallocated_internodes[cur_cache_index++]);
+            if (!nn) {
+                // Should never happen with pre-allocated internodes. This bad flow is not handled.
+                ti.set_last_error(MT_MERR_MAKE_SPLIT_INTERNODE_ALLOC_NOT_EXPECTED);
+            }
+
+            if (cur_cache_index == reserved_nodes) {
+                // Should never happen with pre-allocated internodes (we should have enough reserved nodes). This bad flow is not handled.
+                ti.set_last_error(MT_MERR_MAKE_SPLIT_INTERNODE_ALLOC_EMPTY_PRE_ALLOC_NOT_EXPECTED);
+            }
             nn->child_[0] = n;
             nn->assign(0, xikey[sense], child);
             nn->nkeys_ = 1;
@@ -233,11 +308,22 @@ bool tcursor<P>::make_split(threadinfo& ti)
             n->set_parent(nn);
         } else {
             if (p->size() >= p->width) {
-                next_child = internode_type::make(height + 1, ti);
+                masstree_invariant(preallocated_internodes[cur_cache_index]);
+                next_child = internode_type::make(height + 1, ti, preallocated_internodes[cur_cache_index++]);
+                if (!next_child) {
+                    // Should never happen with pre-allocated internodes. This bad flow is not handled.
+                    ti.set_last_error(MT_MERR_MAKE_SPLIT_INTERNODE_ALLOC_NOT_EXPECTED_2);
+                }
+
+                if (cur_cache_index == reserved_nodes) {
+                    // Should never happen with pre-allocated internodes (we should have enough reserved nodes). This bad flow is not handled.
+                    ti.set_last_error(MT_MERR_MAKE_SPLIT_INTERNODE_ALLOC_EMPTY_PRE_ALLOC_NOT_EXPECTED_2);
+                }
+
                 next_child->assign_version(*p);
                 next_child->mark_nonroot();
                 kp = p->split_into(next_child, kp, xikey[sense],
-                                   child, xikey[sense ^ 1], split_type);
+                                   child, xikey[sense ^ 1], split_type); // No memory allocation
             }
             if (kp >= 0) {
                 p->shift_up(kp + 1, kp, p->size() - kp);
@@ -259,16 +345,27 @@ bool tcursor<P>::make_split(threadinfo& ti)
     int width = perml.size();
     perml.set_size(width - nr->size());
     // removed item, if any, must be @ perml.size()
+    int perm_size = perml.size();
+    masstree_invariant(perm_size > 0); // Verify that the leaf is not empty
     if (width != nl->width) {
-        perml.exchange(perml.size(), nl->width - 1);
+        perml.exchange(perm_size, nl->width - 1);
     }
     nl->mark_split();
     nl->permutation_ = perml.value();
     // account for split
     if (split_type == 0) {
         kx_.p = perml.back();
-        nl->assign(kx_.p, ka_, ti);
+
+        // In case the newly inserted key should be placed in the original leaf (left leaf), memory allocation might be needed for its ksuffix.
+        // If assign fails (--> memory allocation failure), the flow will continue, but we mark rc as false to indicate that the insertion failed.
+        // In this case, the key won't be exposed in finish_insert(), but the leaf split will be completed successfully.
+        if (!nl->assign(kx_.p, ka_, ti)) {
+            ti.set_last_error(MT_MERR_MAKE_SPLIT_ASSIGN_SUFFIX);
+            rc = false;
+        }
+#ifndef MOT_OBSOLETE_CODE
         new_nodes_.emplace_back(nr, nr->full_unlocked_version_value());
+#endif
     } else {
         kx_.i = kx_.p = kx_.i - perml.size();
         n_ = nr;
@@ -296,7 +393,10 @@ bool tcursor<P>::make_split(threadinfo& ti)
         }
     }
 
-    return false;
+    // Free unused pre-allocated internodes
+    release_internodes(preallocated_internodes, cur_cache_index, reserved_nodes, ti);
+
+    return rc;
 }
 
 } // namespace Masstree
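The reservation size, layer_height + 5, is consistent with how far a leaf split can propagate: at most one new internode per existing internode level plus possibly one new root, so a reservation of roughly the layer height already suffices, and the slack of 5 only covers the height estimate being read from an unlocked root. The two lines below merely restate that bound; the function name is invented for illustration.

    // Worst case for a single leaf split: one new internode per internode level,
    // plus possibly one brand-new root on top of the layer.
    static inline int internodes_needed_for_split(int internode_levels) {
        return internode_levels + 1;
    }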

diff --git a/masstree_struct.hh b/masstree_struct.hh
index 8f121a9..1b5d853 100644
--- a/masstree_struct.hh
+++ b/masstree_struct.hh
@@ -120,9 +120,15 @@ class internode : public node_base<P> {
         : node_base<P>(false), nkeys_(0), height_(height), parent_() {
     }
 
-    static internode<P>* make(uint32_t height, threadinfo& ti) {
-        void* ptr = ti.pool_allocate(MAX_MEMTAG_MASSTREE_INTERNODE_ALLOCATION_SIZE,
+    static internode<P>* make(uint32_t height, threadinfo& ti, void * allocated_internode = nullptr) {
+        void* ptr = allocated_internode ?
+            allocated_internode :
+            ti.pool_allocate(MAX_MEMTAG_MASSTREE_INTERNODE_ALLOCATION_SIZE,
                              memtag_masstree_internode);
+        if (!ptr) {
+            ti.set_last_error(MT_MERR_MAKE_INTERNODE);
+            return nullptr;
+        }
         internode<P>* n = new(ptr) internode<P>(height);
         assert(n);
         if (P::debug_level > 0)
@@ -319,8 +325,12 @@ class leaf : public node_base<P> {
     }
 
     static leaf<P>* make(int ksufsize, phantom_epoch_type phantom_epoch, threadinfo& ti) {
-        size_t sz = iceil(sizeof(leaf<P>) + std::min(ksufsize, 128), 64);
+        size_t sz = MAX_MEMTAG_MASSTREE_LEAF_ALLOCATION_SIZE; // iceil(sizeof(leaf<P>) + std::min(ksufsize, 128), 64);
         void* ptr = ti.pool_allocate(sz, memtag_masstree_leaf);
+        if (!ptr) {
+            ti.set_last_error(MT_MERR_MAKE_LEAF);
+            return nullptr;
+        }
         leaf<P>* n = new(ptr) leaf<P>(sz, phantom_epoch);
         assert(n);
         if (P::debug_level > 0) {
@@ -330,6 +340,10 @@ class leaf : public node_base<P> {
     }
     static leaf<P>* make_root(int ksufsize, leaf<P>* parent, threadinfo& ti) {
         leaf<P>* n = make(ksufsize, parent ? parent->phantom_epoch() : phantom_epoch_type(), ti);
+        if (!n) {
+            ti.set_last_error(MT_MERR_MAKE_ROOT_LEAF);
+            return nullptr;
+        }
         n->next_.ptr = n->prev_ = 0;
         n->ikey0_[0] = 0; // to avoid undefined behavior
         n->make_layer_root();
@@ -413,7 +427,9 @@ class leaf : public node_base<P> {
     }
     Str ksuf(int p, int keylenx) const {
         (void) keylenx;
-        masstree_precondition(keylenx_has_ksuf(keylenx));
+        // keylenx might not be equal to ksuf_keylenx, as this operation might be called without holding the leaf's lock.
+        // We allow it, and expect the caller to validate the leaf's version and retry.
+        //masstree_precondition(keylenx_has_ksuf(keylenx));
         return ksuf_ ? ksuf_->get(p) : iksuf_[0].get(p);
     }
     Str ksuf(int p) const {
@@ -429,7 +445,7 @@ class leaf : public node_base<P> {
         return s.len == ka.suffix().len
             && string_slice<uintptr_t>::equals_sloppy(s.s, ka.suffix().s, s.len);
     }
-    // Returns 1 if match & not layer, 0 if no match, <0 if match and layer
+    // Returns 1 if match & not layer, 0 if no match, < 0 if match and layer
     int ksuf_matches(int p, const key_type& ka) const {
         int keylenx = keylenx_[p];
         if (keylenx < ksuf_keylenx)
@@ -520,40 +536,55 @@ class leaf : public node_base<P> {
         modstate_ = modstate_deleted_layer;
     }
 
-    inline void assign(int p, const key_type& ka, threadinfo& ti) {
+    inline bool assign(int p, const key_type& ka, threadinfo& ti) {
         lv_[p] = leafvalue_type::make_empty();
         ikey0_[p] = ka.ikey();
         if (!ka.has_suffix()) {
             keylenx_[p] = ka.length();
         } else {
             keylenx_[p] = ksuf_keylenx;
-            assign_ksuf(p, ka.suffix(), false, ti);
+            if (!assign_ksuf(p, ka.suffix(), false, ti)) {
+                ti.set_last_error(MT_MERR_LEAF_ASSIGN);
+                return false;
+            }
         }
+
+        return true;
     }
-    inline void assign_initialize(int p, const key_type& ka, threadinfo& ti) {
+    inline bool assign_initialize(int p, const key_type& ka, threadinfo& ti) {
         lv_[p] = leafvalue_type::make_empty();
         ikey0_[p] = ka.ikey();
         if (!ka.has_suffix()) {
             keylenx_[p] = ka.length();
         } else {
             keylenx_[p] = ksuf_keylenx;
-            assign_ksuf(p, ka.suffix(), true, ti);
+            if (!assign_ksuf(p, ka.suffix(), true, ti)) {
+                ti.set_last_error(MT_MERR_ASSIGN_INITALIZE_1);
+                return false;
+            }
         }
+
+        return true;
     }
-    inline void assign_initialize(int p, leaf<P>* x, int xp, threadinfo& ti) {
+    inline bool assign_initialize(int p, leaf<P>* x, int xp, threadinfo& ti) {
         lv_[p] = x->lv_[xp];
         ikey0_[p] = x->ikey0_[xp];
         keylenx_[p] = x->keylenx_[xp];
         if (x->has_ksuf(xp)) {
-            assign_ksuf(p, x->ksuf(xp), true, ti);
+            if (!assign_ksuf(p, x->ksuf(xp), true, ti)) {
+                ti.set_last_error(MT_MERR_ASSIGN_INITALIZE_2);
+                return false;
+            }
         }
+
+        return true;
     }
     inline void assign_initialize_for_layer(int p, const key_type& ka) {
         assert(ka.has_suffix());
         ikey0_[p] = ka.ikey();
         keylenx_[p] = layer_keylenx;
     }
-    void assign_ksuf(int p, Str s, bool initializing, threadinfo& ti);
+    bool assign_ksuf(int p, Str s, bool initializing, threadinfo& ti);
 
     inline ikey_type ikey_after_insert(const permuter_type& perm, int i,
                                        const tcursor<P>* cursor) const;
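assign() and assign_initialize() can only fail through assign_ksuf(), which first tries the leaf's fixed internal suffix area (the 128-byte budget reflected in MAX_MEMTAG_MASSTREE_LEAF_ALLOCATION_SIZE) and only then allocates an external suffix block, which is the allocation that may now fail. The class below is a simplified model of that two-tier decision, not the patch's data layout.

    #include <cstddef>
    #include <new>
    #include <optional>
    #include <string>

    // Two-tier suffix storage: small inline area first, heap fallback second.
    class suffix_slot {
      public:
        explicit suffix_slot(std::size_t inline_cap = 128) : inline_cap_(inline_cap) {}

        // Returns false only when the external (heap) allocation fails,
        // mirroring the new bool result of assign_ksuf().
        bool assign(const std::string& suffix) {
            if (suffix.size() <= inline_cap_) {
                inline_value_ = suffix;          // fits in the internal iksuf_-like area
                return true;
            }
            try {
                external_value_ = suffix;        // stands in for the external ksuffix block
            } catch (const std::bad_alloc&) {
                return false;                    // report OOM instead of crashing
            }
            return true;
        }

      private:
        std::size_t inline_cap_;
        std::string inline_value_;
        std::optional<std::string> external_value_;
    };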

@@ -763,14 +794,14 @@ leaf<P>* leaf<P>::advance_to_key(const key_type& ka, nodeversion_type& v,
    positions [0,p) are ready: keysuffixes in that range are copied. In either
    case, the key at position p is NOT copied; it is assigned to @a s. */
 template <typename P>
-void leaf<P>::assign_ksuf(int p, Str s, bool initializing, threadinfo& ti) {
+bool leaf<P>::assign_ksuf(int p, Str s, bool initializing, threadinfo& ti) {
     if ((ksuf_ && ksuf_->assign(p, s))
         || (extrasize64_ > 0 && iksuf_[0].assign(p, s))) {
 #if !(defined(__x86_64__) || defined(__x86__))
         fence();
-#endif
-        return;
+#endif
+        return true;
     }
 
     external_ksuf_type* oksuf = ksuf_;
@@ -796,15 +827,19 @@ void leaf<P>::assign_ksuf(int p, Str s, bool initializing, threadinfo& ti) {
         sz = std::max(sz, oksuf->capacity());
 
     void* ptr = ti.allocate(sz, memtag_masstree_ksuffixes, &sz);
+    if (!ptr) {
+        ti.set_last_error(MT_MERR_ASSIGN_KSUF);
+        return false;
+    }
     external_ksuf_type* nksuf = new(ptr) external_ksuf_type(width, sz);
     for (int i = 0; i < n; ++i) {
         int mp = initializing ? i : perm[i];
         if (mp != p && has_ksuf(mp)) {
-            bool ok = nksuf->assign(mp, ksuf(mp));
+            bool ok = nksuf->assign(mp, ksuf(mp)); // No memory allocation here
             assert(ok); (void) ok;
         }
     }
-    bool ok = nksuf->assign(p, s);
+    bool ok = nksuf->assign(p, s); // No memory allocation here
     assert(ok); (void) ok;
     fence();
@@ -824,11 +859,12 @@ void leaf<P>::assign_ksuf(int p, Str s, bool initializing, threadinfo& ti) {
 
     if (oksuf)
         ti.deallocate_rcu(oksuf, oksuf->capacity(),
                           memtag_masstree_ksuffixes);
+
+    return true;
 }
 
 template <typename P>
 inline basic_table<P>::basic_table()
-    : root_(0) {
+    : root_(nullptr) {
 }
 
 template <typename P>
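basic_table and tcursor now hand around a node_base<P>** (root_ref) rather than a copy of the root pointer, so that the layer-removal callback can rewrite the live root slot instead of a stale copy. The snippet below shows, with plain pointers, why the extra level of indirection is needed; it is a generic illustration, not the patch's types.

    #include <cassert>

    struct node { int id; };

    // A copy of the root pointer: replacing the root is invisible to the tree.
    static void replace_root_by_value(node* root, node* new_root) {
        root = new_root;          // only the local copy changes
        (void)root;
    }

    // A pointer to the root slot: the tree itself is updated.
    static void replace_root_by_ref(node** root_ref, node* new_root) {
        *root_ref = new_root;     // the slot held by the table / parent layer changes
    }

    int main() {
        node a{1}, b{2};
        node* root = &a;
        replace_root_by_value(root, &b);
        assert(root == &a);       // unchanged
        replace_root_by_ref(&root, &b);
        assert(root == &b);       // the real slot now points at the new root
        return 0;
    }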

diff --git a/masstree_tcursor.hh b/masstree_tcursor.hh
index 2442c9f..755588d 100644
--- a/masstree_tcursor.hh
+++ b/masstree_tcursor.hh
@@ -106,22 +106,23 @@ class tcursor {
     static constexpr int new_nodes_size = 1; // unless we make a new trie newnodes will have at most 1 item
     typedef small_vector<std::pair<leaf_type*, nodeversion_value_type>, new_nodes_size> new_nodes_type;
 
+#ifndef MOT_OBSOLETE_CODE
     tcursor(basic_table<P>& table, Str str)
         : ka_(str), root_(table.fix_root()) {
     }
     tcursor(basic_table<P>& table, const char* s, int len)
         : ka_(s, len), root_(table.fix_root()) {
     }
-    tcursor(basic_table<P>& table, const unsigned char* s, int len)
-        : ka_(reinterpret_cast<const char*>(s), len), root_(table.fix_root()) {
-    }
-    tcursor(node_base<P>* root, const char* s, int len)
-        : ka_(s, len), root_(root) {
-    }
     tcursor(node_base<P>* root, const unsigned char* s, int len)
         : ka_(reinterpret_cast<const char*>(s), len), root_(root) {
     }
-
+#endif
+    tcursor(basic_table<P>& table, const unsigned char* s, int len)
+        : ka_(reinterpret_cast<const char*>(s), len), root_(table.fix_root()), root_ref_(table.root_ref()) {
+    }
+    tcursor(node_base<P>** root_ref, const char* s, int len)
+        : ka_(s, len), root_(*root_ref), root_ref_(root_ref) {
+    }
     inline bool has_value() const {
         return kx_.p >= 0;
     }
@@ -148,13 +149,13 @@ class tcursor {
     inline nodeversion_value_type updated_version_value() const {
         return updated_v_;
     }
-
+#ifndef MOT_OBSOLETE_CODE
     inline const new_nodes_type &new_nodes() const {
         return new_nodes_;
     }
-
+#endif
     inline bool find_locked(threadinfo& ti);
-    inline bool find_insert(threadinfo& ti);
+    inline bool find_insert(threadinfo& ti, bool & found);
 
     inline void finish(int answer, threadinfo& ti);
@@ -166,13 +167,16 @@ class tcursor {
     key_type ka_;
     key_indexed_position kx_;
     node_base<P>* root_;
+    node_base<P>** root_ref_;
     int state_;
 
-    leaf_type* original_n_;
+    leaf_type* original_n_ = nullptr;
     nodeversion_value_type original_v_;
     nodeversion_value_type updated_v_;
-    new_nodes_type new_nodes_;
 
+#ifndef MOT_OBSOLETE_CODE
+    new_nodes_type new_nodes_;
+#endif
     inline node_type* reset_retry() {
         ka_.unshift_all();
         return root_;
@@ -180,6 +184,7 @@ class tcursor {
 
     bool make_new_layer(threadinfo& ti);
     bool make_split(threadinfo& ti);
+    void release_internodes(internode_type * internodes_array[], int start, int end, threadinfo& ti);
     friend class leaf<P>;
     inline void finish_insert();
     inline bool finish_remove(threadinfo& ti);
@@ -191,7 +196,7 @@ class tcursor {
      * If removing a leaf in layer 0, @a prefix is empty.
      * If removing, for example, the node containing key "01234567ABCDEF" in the layer-1 tree
      * rooted at "01234567", then @a prefix should equal "01234567". */
-    static bool remove_leaf(leaf_type* leaf, node_type* root,
+    static bool remove_leaf(leaf_type* leaf, node_type** root_ref,
                             Str prefix, threadinfo& ti);
 
     bool gc_layer(threadinfo& ti);
 
diff --git a/mot_masstree_config.hpp b/mot_masstree_config.hpp
index bec2ec8..fe3930b 100644
--- a/mot_masstree_config.hpp
+++ b/mot_masstree_config.hpp
@@ -25,6 +25,9 @@
 #ifndef MOT_MASSTREE_CONFIG_HPP
 #define MOT_MASSTREE_CONFIG_HPP
 
+// Ignore masstree code which is obsolete in MOT
+#define MOT_OBSOLETE_CODE 1
+
 #define MOT_HAVE_CXX_TEMPLATE_ALIAS 1
 #define MOT_HAVE_INT64_T_IS_LONG 1
 #define MOT_HAVE_SIZE_T_IS_UNSIGNED_LONG 1
@@ -58,7 +61,7 @@
 #define MOT_SIZEOF_LONG_LONG 8
 #define MOT_SIZEOF_SHORT 2
 #define MOT_WORDS_BIGENDIAN_SET 1
-
+/*
 #define masstree_invariant(x, ...) \
     do { \
     } while (0)
@@ -66,6 +69,12 @@
 #define masstree_precondition(x, ...) \
     do { \
     } while (0)
+*/
+
+#define masstree_invariant(x, ...) assert(x)
+#define masstree_precondition(x, ...) assert(x)
+
+
 #ifndef invariant
 #define invariant masstree_invariant
-- 
1.8.3.1
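Note on the config change: masstree_invariant and masstree_precondition are no longer compiled out unconditionally; they now forward to assert, so the invariants added throughout this patch are checked in debug builds while an NDEBUG build still compiles them away. A minimal sketch of the resulting behavior, assuming <cassert> is reachable from the existing includes:

    #include <cassert>

    #define masstree_invariant(x, ...) assert(x)
    #define masstree_precondition(x, ...) assert(x)

    // Debug build: aborts if the condition is violated.
    // NDEBUG build: assert(x) expands to a no-op, so release binaries behave as before.
    static inline void set_last_error_example(int error) {
        masstree_invariant(error < 30);   // e.g. the check inside threadinfo::set_last_error
    }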