diff --git a/be/src/gutil/CMakeLists.txt b/be/src/gutil/CMakeLists.txt index 2c0d283619..1cc33438f1 100644 --- a/be/src/gutil/CMakeLists.txt +++ b/be/src/gutil/CMakeLists.txt @@ -31,6 +31,7 @@ SET(SOURCE_FILES hash/jenkins.cc int128.cc once.cc + ref_counted.cc spinlock_internal.cc stringprintf.cc strings/ascii_ctype.cc @@ -46,6 +47,8 @@ SET(SOURCE_FILES strings/substitute.cc strings/util.cc strtoint.cc + sysinfo.cc + threading/thread_collision_warner.cc utf/rune.c cpu.cc) diff --git a/be/src/gutil/arm_instruction_set_select.h b/be/src/gutil/arm_instruction_set_select.h new file mode 100644 index 0000000000..87bc183358 --- /dev/null +++ b/be/src/gutil/arm_instruction_set_select.h @@ -0,0 +1,52 @@ +// Copyright 2011 Google Inc. +// All Rights Reserved. +// +// +// Generalizes the plethora of ARM flavors available to an easier to manage set +// Defs reference is at https://wiki.edubuntu.org/ARM/Thumb2PortingHowto + +#ifndef ARM_INSTRUCTION_SET_SELECT_H_ +#define ARM_INSTRUCTION_SET_SELECT_H_ + +#if defined(__ARM_ARCH_7__) || \ + defined(__ARM_ARCH_7R__) || \ + defined(__ARM_ARCH_7A__) +# define ARMV7 1 +#endif + +#if defined(ARMV7) || \ + defined(__ARM_ARCH_6__) || \ + defined(__ARM_ARCH_6J__) || \ + defined(__ARM_ARCH_6K__) || \ + defined(__ARM_ARCH_6Z__) || \ + defined(__ARM_ARCH_6T2__) || \ + defined(__ARM_ARCH_6ZK__) +# define ARMV6 1 +#endif + +#if defined(ARMV6) || \ + defined(__ARM_ARCH_5T__) || \ + defined(__ARM_ARCH_5E__) || \ + defined(__ARM_ARCH_5TE__) || \ + defined(__ARM_ARCH_5TEJ__) +# define ARMV5 1 +#endif + +#if defined(ARMV5) || \ + defined(__ARM_ARCH_4__) || \ + defined(__ARM_ARCH_4T__) +# define ARMV4 1 +#endif + +#if defined(ARMV4) || \ + defined(__ARM_ARCH_3__) || \ + defined(__ARM_ARCH_3M__) +# define ARMV3 1 +#endif + +#if defined(ARMV3) || \ + defined(__ARM_ARCH_2__) +# define ARMV2 1 +#endif + +#endif // ARM_INSTRUCTION_SET_SELECT_H_ diff --git a/be/src/gutil/atomic_refcount.h b/be/src/gutil/atomic_refcount.h new file mode 100644 index 0000000000..2b5cb36099 --- /dev/null +++ b/be/src/gutil/atomic_refcount.h @@ -0,0 +1,153 @@ +#ifndef BASE_ATOMIC_REFCOUNT_H_ +#define BASE_ATOMIC_REFCOUNT_H_ +// Copyright 2008 Google Inc. +// All rights reserved. + +// Atomic increment and decrement for reference counting. +// For atomic operations on statistics counters and sequence numbers, +// see atomic_stats_counter.h and atomic_sequence_num.h respectively. + +// Some clients use atomic operations for reference counting. +// you use one of them: +// util/refcount/reference_counted.h +// util/gtl/refcounted_ptr.h +// util/gtl/shared_ptr.h +// Alternatively, use a Mutex to maintain your reference count. +// If you really must build your own reference counts with atomic operations, +// use the following routines in the way suggested by this example: +// AtomicWord ref_count_; // remember to initialize this to 0 +// ... +// void Ref() { +// base::RefCountInc(&this->ref_count_); +// } +// void Unref() { +// if (!base::RefCountDec(&this->ref_count_)) { +// delete this; +// } +// } +// Using these routines (rather than the ones in atomicops.h) will provide the +// correct semantics; in particular, the memory ordering needed to make +// reference counting work will be guaranteed. +// You need not declare the reference count word "volatile". After +// initialization you should use the word only via the routines below; the +// "volatile" in the signatures below is for backwards compatibility. +// +// If you need to do something very different from this, use a Mutex. 
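// [Editor's illustration, not part of the patch] A minimal sketch of the
// pattern described above, written against the routines this header declares
// (base::RefCountInc, base::RefCountDec, base::RefCountIsOne) and the Atomic32
// type from gutil/atomicops.h. The class and member names are hypothetical.
//
//   class RefCountedBuffer {
//    public:
//     RefCountedBuffer() : ref_count_(0) {}
//     void Ref() { base::RefCountInc(&ref_count_); }
//     void Unref() {
//       // The barrier inside RefCountDec makes writes performed before the
//       // count reached zero visible to the thread that deletes the object.
//       if (!base::RefCountDec(&ref_count_)) {
//         delete this;
//       }
//     }
//     // In-place mutation (copy-on-write style) is safe only when the caller
//     // holds the sole reference.
//     bool IsSoleOwner() const { return base::RefCountIsOne(&ref_count_); }
//    private:
//     volatile Atomic32 ref_count_;
//   };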
+ +#include + +#include "gutil/atomicops.h" +#include "gutil/integral_types.h" +#include "gutil/logging-inl.h" + +namespace base { + +// These calls are available for both Atomic32, and AtomicWord types, +// and also for base::subtle::Atomic64 if available on the platform. + +// Normally, clients are expected to use RefCountInc/RefCountDec. +// In rare cases, it may be necessary to adjust the reference count by +// more than 1, in which case they may use RefCountIncN/RefCountDecN. + +// Increment a reference count by "increment", which must exceed 0. +inline void RefCountIncN(volatile Atomic32 *ptr, Atomic32 increment) { + DCHECK_GT(increment, 0); + base::subtle::NoBarrier_AtomicIncrement(ptr, increment); +} + +// Decrement a reference count by "decrement", which must exceed 0, +// and return whether the result is non-zero. +// Insert barriers to ensure that state written before the reference count +// became zero will be visible to a thread that has just made the count zero. +inline bool RefCountDecN(volatile Atomic32 *ptr, Atomic32 decrement) { + DCHECK_GT(decrement, 0); + bool res = base::subtle::Barrier_AtomicIncrement(ptr, -decrement) != 0; + return res; +} + +// Increment a reference count by 1. +inline void RefCountInc(volatile Atomic32 *ptr) { + base::RefCountIncN(ptr, 1); +} + +// Decrement a reference count by 1 and return whether the result is non-zero. +// Insert barriers to ensure that state written before the reference count +// became zero will be visible to a thread that has just made the count zero. +inline bool RefCountDec(volatile Atomic32 *ptr) { + return base::RefCountDecN(ptr, 1); +} + +// Return whether the reference count is one. +// If the reference count is used in the conventional way, a +// refrerence count of 1 implies that the current thread owns the +// reference and no other thread shares it. +// This call performs the test for a referenece count of one, and +// performs the memory barrier needed for the owning thread +// to act on the object, knowing that it has exclusive access to the +// object. +inline bool RefCountIsOne(const volatile Atomic32 *ptr) { + return base::subtle::Acquire_Load(ptr) == 1; +} + +// Return whether the reference count is zero. With conventional object +// referencing counting, the object will be destroyed, so the reference count +// should never be zero. Hence this is generally used for a debug check. +inline bool RefCountIsZero(const volatile Atomic32 *ptr) { + return subtle::Acquire_Load(ptr) == 0; +} + +#if BASE_HAS_ATOMIC64 +// Implementations for Atomic64, if available. +inline void RefCountIncN(volatile base::subtle::Atomic64 *ptr, + base::subtle::Atomic64 increment) { + DCHECK_GT(increment, 0); + base::subtle::NoBarrier_AtomicIncrement(ptr, increment); +} +inline bool RefCountDecN(volatile base::subtle::Atomic64 *ptr, + base::subtle::Atomic64 decrement) { + DCHECK_GT(decrement, 0); + return base::subtle::Barrier_AtomicIncrement(ptr, -decrement) != 0; +} +inline void RefCountInc(volatile base::subtle::Atomic64 *ptr) { + base::RefCountIncN(ptr, 1); +} +inline bool RefCountDec(volatile base::subtle::Atomic64 *ptr) { + return base::RefCountDecN(ptr, 1); +} +inline bool RefCountIsOne(const volatile base::subtle::Atomic64 *ptr) { + return base::subtle::Acquire_Load(ptr) == 1; +} +inline bool RefCountIsZero(const volatile base::subtle::Atomic64 *ptr) { + return base::subtle::Acquire_Load(ptr) == 0; +} +#endif + +#ifdef AtomicWordCastType +// Implementations for AtomicWord, if it's a different type from the above. 
+inline void RefCountIncN(volatile AtomicWord *ptr, AtomicWord increment) { + base::RefCountIncN( + reinterpret_cast(ptr), increment); +} +inline bool RefCountDecN(volatile AtomicWord *ptr, AtomicWord decrement) { + return base::RefCountDecN( + reinterpret_cast(ptr), decrement); +} +inline void RefCountInc(volatile AtomicWord *ptr) { + base::RefCountIncN(ptr, 1); +} +inline bool RefCountDec(volatile AtomicWord *ptr) { + return base::RefCountDecN(ptr, 1); +} +inline bool RefCountIsOne(const volatile AtomicWord *ptr) { + return base::subtle::Acquire_Load( + reinterpret_cast(ptr)) == 1; +} +inline bool RefCountIsZero(const volatile AtomicWord *ptr) { + return base::subtle::Acquire_Load( + reinterpret_cast(ptr)) == 0; +} +#endif + +} // namespace base + +#endif // BASE_ATOMIC_REFCOUNT_H_ diff --git a/be/src/gutil/cycleclock-inl.h b/be/src/gutil/cycleclock-inl.h new file mode 100644 index 0000000000..063b397a36 --- /dev/null +++ b/be/src/gutil/cycleclock-inl.h @@ -0,0 +1,215 @@ +// Copyright (C) 1999-2007 Google, Inc. +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// All rights reserved. +// Extracted from base/timer.h by jrvb + +// The implementation of CycleClock::Now() +// See cycleclock.h +// +// IWYU pragma: private, include "base/cycleclock.h" + +// NOTE: only i386 and x86_64 have been well tested. +// PPC, sparc, alpha, and ia64 are based on +// http://peter.kuscsik.com/wordpress/?p=14 +// with modifications by m3b. See also +// https://setisvn.ssl.berkeley.edu/svn/lib/fftw-3.0.1/kernel/cycle.h + +#ifndef GUTIL_CYCLECLOCK_INL_H_ +#define GUTIL_CYCLECLOCK_INL_H_ + +#include + +#include "gutil/port.h" +#include "gutil/arm_instruction_set_select.h" + +// Please do not nest #if directives. Keep one section, and one #if per +// platform. + +// For historical reasons, the frequency on some platforms is scaled to be +// close to the platform's core clock frequency. This is not guaranteed by the +// interface, and may change in future implementations. + +// ---------------------------------------------------------------- +#if defined(__APPLE__) +#include +inline int64 CycleClock::Now() { + // this goes at the top because we need ALL Macs, regardless of + // architecture, to return the number of "mach time units" that + // have passed since startup. See sysinfo.cc where + // InitializeSystemInfo() sets the supposed cpu clock frequency of + // macs to the number of mach time units per second, not actual + // CPU clock frequency (which can change in the face of CPU + // frequency scaling). Also note that when the Mac sleeps, this + // counter pauses; it does not continue counting, nor does it + // reset to zero. 
+ return mach_absolute_time(); +} + +// ---------------------------------------------------------------- +#elif defined(__i386__) +inline int64 CycleClock::Now() { + int64 ret; + __asm__ volatile("rdtsc" : "=A" (ret)); + return ret; +} + +// ---------------------------------------------------------------- +#elif defined(__x86_64__) || defined(__amd64__) +inline int64 CycleClock::Now() { + uint64 low, high; + __asm__ volatile("rdtsc" : "=a" (low), "=d" (high)); + return (high << 32) | low; +} + +// ---------------------------------------------------------------- +#elif defined(__powerpc__) || defined(__ppc__) +#define SPR_TB 268 +#define SPR_TBU 269 +inline int64 CycleClock::Now() { + uint64 time_base_value; + if (sizeof(void*) == 8) { + // On PowerPC64, time base can be read with one SPR read. + asm volatile("mfspr %0, %1" : "=r" (time_base_value) : "i"(SPR_TB)); + } else { + uint32 tbl, tbu0, tbu1; + asm volatile (" mfspr %0, %3\n" + " mfspr %1, %4\n" + " mfspr %2, %3\n" : + "=r"(tbu0), "=r"(tbl), "=r"(tbu1) : + "i"(SPR_TBU), "i"(SPR_TB)); + // If there is a carry into the upper half, it is okay to return + // (tbu1, 0) since it must be between the 2 TBU reads. + tbl &= -static_cast(tbu0 == tbu1); + // high 32 bits in tbu1; low 32 bits in tbl (tbu0 is garbage) + time_base_value = + (static_cast(tbu1) << 32) | static_cast(tbl); + } + return static_cast(time_base_value); +} + +// ---------------------------------------------------------------- +#elif defined(__sparc__) +inline int64 CycleClock::Now() { + int64 tick; + asm(".byte 0x83, 0x41, 0x00, 0x00"); + asm("mov %%g1, %0" : "=r" (tick)); + return tick; +} + +// ---------------------------------------------------------------- +#elif defined(__ia64__) +inline int64 CycleClock::Now() { + int64 itc; + asm("mov %0 = ar.itc" : "=r" (itc)); + return itc; +} + +// ---------------------------------------------------------------- +#elif defined(_MSC_VER) && defined(_M_IX86) +inline int64 CycleClock::Now() { + // Older MSVC compilers (like 7.x) don't seem to support the + // __rdtsc intrinsic properly, so I prefer to use _asm instead + // when I know it will work. Otherwise, I'll use __rdtsc and hope + // the code is being compiled with a non-ancient compiler. + _asm rdtsc +} + +// ---------------------------------------------------------------- +#elif defined(_MSC_VER) +// For MSVC, we want to use '_asm rdtsc' when possible (since it works +// with even ancient MSVC compilers), and when not possible the +// __rdtsc intrinsic, declared in . Unfortunately, in some +// environments, and have conflicting +// declarations of some other intrinsics, breaking compilation. +// Therefore, we simply declare __rdtsc ourselves. See also +// http://connect.microsoft.com/VisualStudio/feedback/details/262047 +extern "C" uint64 __rdtsc(); +#pragma intrinsic(__rdtsc) +inline int64 CycleClock::Now() { + return __rdtsc(); +} + +// ---------------------------------------------------------------- +#elif defined(ARMV6) // V6 is the earliest arm that has a standard cyclecount +#include "gutil/sysinfo.h" +inline int64 CycleClock::Now() { + uint32 pmccntr; + uint32 pmuseren; + uint32 pmcntenset; + // Read the user mode perf monitor counter access permissions. + asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r" (pmuseren)); + if (pmuseren & 1) { // Allows reading perfmon counters for user mode code. + asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r" (pmcntenset)); + if (pmcntenset & 0x80000000ul) { // Is it counting? 
+ asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (pmccntr)); + // The counter is set up to count every 64th cycle + return static_cast(pmccntr) * 64; // Should optimize to << 6 + } + } + struct timeval tv; + gettimeofday(&tv, NULL); + return static_cast((tv.tv_sec + tv.tv_usec * 0.000001) + * CyclesPerSecond()); +} + +// ---------------------------------------------------------------- +#elif defined(ARMV3) +#include "gutil/sysinfo.h" // for CyclesPerSecond() +inline int64 CycleClock::Now() { + struct timeval tv; + gettimeofday(&tv, NULL); + return static_cast((tv.tv_sec + tv.tv_usec * 0.000001) + * CyclesPerSecond()); +} + +// ---------------------------------------------------------------- +#elif defined(__mips__) +#include "gutil/sysinfo.h" +inline int64 CycleClock::Now() { + // mips apparently only allows rdtsc for superusers, so we fall + // back to gettimeofday. It's possible clock_gettime would be better. + struct timeval tv; + gettimeofday(&tv, NULL); + return static_cast((tv.tv_sec + tv.tv_usec * 0.000001) + * CyclesPerSecond()); +} + +// ---------------------------------------------------------------- +#elif defined(__aarch64__) +#include "gutil/sysinfo.h" +inline int64 CycleClock::Now() { + // System timer of ARMv8 runs at a different frequency than the CPU's. + // The frequency is fixed, typically in the range 1-50MHz. It can be + // read at CNTFRQ special register. We assume the OS has set up + // the virtual timer properly. + int64_t virtual_timer_value; + asm volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value)); + return virtual_timer_value; +} +// ---------------------------------------------------------------- +#else +// The soft failover to a generic implementation is automatic only for some +// platforms. For other platforms the developer is expected to make an attempt +// to create a fast implementation and use generic version if nothing better is +// available. +#error You need to define CycleTimer for your O/S and CPU +#endif + +#endif // GUTIL_CYCLECLOCK_INL_H_ diff --git a/be/src/gutil/endian.h b/be/src/gutil/endian.h index 6b8a0bc772..f957df74aa 100644 --- a/be/src/gutil/endian.h +++ b/be/src/gutil/endian.h @@ -1,10 +1,32 @@ -// Copyright 2005 Google, Inc +// Copyright 2005 Google Inc. +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// --- +// // // Utility functions that depend on bytesex. We define htonll and ntohll, // as well as "Google" versions of all the standards: ghtonl, ghtons, and // so on. These functions do exactly the same as their standard variants, // but don't require including the dangerous netinet/in.h. 
- +// +// Buffer routines will copy to and from buffers without causing +// a bus error when the architecture requires differnt byte alignments #ifndef UTIL_ENDIAN_ENDIAN_H_ #define UTIL_ENDIAN_ENDIAN_H_ @@ -20,7 +42,7 @@ inline uint64 gbswap_64(uint64 host_int) { if (__builtin_constant_p(host_int)) { return __bswap_constant_64(host_int); } else { - register uint64 result; + uint64 result; __asm__("bswap %0" : "=r" (result) : "0" (host_int)); return result; } @@ -33,7 +55,7 @@ inline uint64 gbswap_64(uint64 host_int) { } inline unsigned __int128 gbswap_128(unsigned __int128 host_int) { - return static_cast(bswap_64(static_cast(host_int >> 64))) | + return static_cast(bswap_64(static_cast(host_int >> 64))) | (static_cast(bswap_64(static_cast(host_int))) << 64); } @@ -58,24 +80,32 @@ inline uint64 ghtonll(uint64 x) { return gbswap_64(x); } #elif defined IS_BIG_ENDIAN -// These definitions are a lot simpler on big-endian machines -#define ghtons(x) (x) -#define ghtonl(x) (x) -#define ghtonll(x) (x) +// These definitions are simpler on big-endian machines +// These are functions instead of macros to avoid self-assignment warnings +// on calls such as "i = ghtnol(i);". This also provides type checking. +inline uint16 ghtons(uint16 x) { return x; } +inline uint32 ghtonl(uint32 x) { return x; } +inline uint64 ghtonll(uint64 x) { return x; } #else -#error "Unsupported bytesex: Either IS_BIG_ENDIAN or IS_LITTLE_ENDIAN must be defined" +#error "Unsupported bytesex: Either IS_BIG_ENDIAN or IS_LITTLE_ENDIAN must be defined" // NOLINT #endif // bytesex -// Convert to little-endian storage, opposite of network format. -// Convert x from host to little endian: x = LittleEndian.FromHost(x); -// convert x from little endian to host: x = LittleEndian.ToHost(x); +// ntoh* and hton* are the same thing for any size and bytesex, +// since the function is an involution, i.e., its own inverse. +#define gntohl(x) ghtonl(x) +#define gntohs(x) ghtons(x) +#define gntohll(x) ghtonll(x) +#if !defined(__APPLE__) +// This one is safe to take as it's an extension +#define htonll(x) ghtonll(x) +#define ntohll(x) htonll(x) +#endif + +// Utilities to convert numbers between the current hosts's native byte +// order and little-endian byte order // -// Store values into unaligned memory converting to little endian order: -// LittleEndian.Store16(p, x); -// -// Load unaligned values stored in little endian coverting to host order: -// x = LittleEndian.Load16(p); +// Load/Store methods are alignment safe class LittleEndian { public: // Conversion functions. @@ -90,6 +120,9 @@ class LittleEndian { static uint64 FromHost64(uint64 x) { return x; } static uint64 ToHost64(uint64 x) { return x; } + static unsigned __int128 FromHost128(unsigned __int128 x) { return x; } + static unsigned __int128 ToHost128(unsigned __int128 x) { return x; } + static bool IsLittleEndian() { return true; } #elif defined IS_BIG_ENDIAN @@ -183,162 +216,169 @@ class LittleEndian { return uint128(Load64VariableLength(p, len)); } else { return uint128( - Load64VariableLength(static_cast(p) + 8, len - 8), - Load64(p)); + Load64VariableLength(static_cast(p) + 8, len - 8), + Load64(p)); } } + + // Load & Store in machine's word size. 
+ static uword_t LoadUnsignedWord(const void *p) { + if (sizeof(uword_t) == 8) + return Load64(p); + else + return Load32(p); + } + + static void StoreUnsignedWord(void *p, uword_t v) { + if (sizeof(v) == 8) + Store64(p, v); + else + Store32(p, v); + } }; - -// This one is safe to take as it's an extension -#define htonll(x) ghtonll(x) - -// ntoh* and hton* are the same thing for any size and bytesex, -// since the function is an involution, i.e., its own inverse. -#define gntohl(x) ghtonl(x) -#define gntohs(x) ghtons(x) -#define gntohll(x) ghtonll(x) -#define ntohll(x) htonll(x) - // Utilities to convert numbers between the current hosts's native byte // order and big-endian byte order (same as network byte order) // // Load/Store methods are alignment safe class BigEndian { -public: + public: #ifdef IS_LITTLE_ENDIAN - static uint16 FromHost16(uint16 x) { return bswap_16(x); } - static uint16 ToHost16(uint16 x) { return bswap_16(x); } + static uint16 FromHost16(uint16 x) { return bswap_16(x); } + static uint16 ToHost16(uint16 x) { return bswap_16(x); } - static uint32 FromHost24(uint32 x) { return bswap_24(x); } - static uint32 ToHost24(uint32 x) { return bswap_24(x); } + static uint32 FromHost24(uint32 x) { return bswap_24(x); } + static uint32 ToHost24(uint32 x) { return bswap_24(x); } - static uint32 FromHost32(uint32 x) { return bswap_32(x); } - static uint32 ToHost32(uint32 x) { return bswap_32(x); } + static uint32 FromHost32(uint32 x) { return bswap_32(x); } + static uint32 ToHost32(uint32 x) { return bswap_32(x); } - static uint64 FromHost64(uint64 x) { return gbswap_64(x); } - static uint64 ToHost64(uint64 x) { return gbswap_64(x); } + static uint64 FromHost64(uint64 x) { return gbswap_64(x); } + static uint64 ToHost64(uint64 x) { return gbswap_64(x); } - static unsigned __int128 FromHost128(unsigned __int128 x) { return gbswap_128(x); } - static unsigned __int128 ToHost128(unsigned __int128 x) { return gbswap_128(x); } + static unsigned __int128 FromHost128(unsigned __int128 x) { return gbswap_128(x); } + static unsigned __int128 ToHost128(unsigned __int128 x) { return gbswap_128(x); } - static bool IsLittleEndian() { return true; } + static bool IsLittleEndian() { return true; } #elif defined IS_BIG_ENDIAN - static uint16 FromHost16(uint16 x) { return x; } - static uint16 ToHost16(uint16 x) { return x; } + static uint16 FromHost16(uint16 x) { return x; } + static uint16 ToHost16(uint16 x) { return x; } static uint32 FromHost24(uint32 x) { return x; } static uint32 ToHost24(uint32 x) { return x; } - static uint32 FromHost32(uint32 x) { return x; } - static uint32 ToHost32(uint32 x) { return x; } + static uint32 FromHost32(uint32 x) { return x; } + static uint32 ToHost32(uint32 x) { return x; } - static uint64 FromHost64(uint64 x) { return x; } - static uint64 ToHost64(uint64 x) { return x; } + static uint64 FromHost64(uint64 x) { return x; } + static uint64 ToHost64(uint64 x) { return x; } - static uint128 FromHost128(uint128 x) { return x; } - static uint128 ToHost128(uint128 x) { return x; } + static uint128 FromHost128(uint128 x) { return x; } + static uint128 ToHost128(uint128 x) { return x; } - static bool IsLittleEndian() { return false; } + static bool IsLittleEndian() { return false; } #endif /* ENDIAN */ - // Functions to do unaligned loads and stores in little-endian order. - static uint16 Load16(const void *p) { - return ToHost16(UNALIGNED_LOAD16(p)); - } + // Functions to do unaligned loads and stores in little-endian order. 
+ static uint16 Load16(const void *p) { + return ToHost16(UNALIGNED_LOAD16(p)); + } - static void Store16(void *p, uint16 v) { - UNALIGNED_STORE16(p, FromHost16(v)); - } + static void Store16(void *p, uint16 v) { + UNALIGNED_STORE16(p, FromHost16(v)); + } - static uint32 Load32(const void *p) { - return ToHost32(UNALIGNED_LOAD32(p)); - } + static uint32 Load32(const void *p) { + return ToHost32(UNALIGNED_LOAD32(p)); + } - static void Store32(void *p, uint32 v) { - UNALIGNED_STORE32(p, FromHost32(v)); - } + static void Store32(void *p, uint32 v) { + UNALIGNED_STORE32(p, FromHost32(v)); + } - static uint64 Load64(const void *p) { - return ToHost64(UNALIGNED_LOAD64(p)); - } + static uint64 Load64(const void *p) { + return ToHost64(UNALIGNED_LOAD64(p)); + } - // Build a uint64 from 1-8 bytes. - // 8 * len least significant bits are loaded from the memory with - // BigEndian order. The 64 - 8 * len most significant bits are - // set all to 0. - // In latex-friendly words, this function returns: - // $\sum_{i=0}^{len-1} p[i] 256^{i}$, where p[i] is unsigned. - // - // This function is equivalent with: - // uint64 val = 0; - // memcpy(&val, p, len); - // return ToHost64(val); - // TODO(user): write a small benchmark and benchmark the speed - // of a memcpy based approach. - // - // For speed reasons this function does not work for len == 0. - // The caller needs to guarantee that 1 <= len <= 8. - static uint64 Load64VariableLength(const void * const p, int len) { - assert(len >= 1 && len <= 8); - uint64 val = Load64(p); - uint64 mask = 0; - --len; - do { - mask = (mask << 8) | 0xff; - // (--len >= 0) is about 10 % faster than (len--) in some benchmarks. - } while (--len >= 0); - return val & mask; - } + // Build a uint64 from 1-8 bytes. + // 8 * len least significant bits are loaded from the memory with + // BigEndian order. The 64 - 8 * len most significant bits are + // set all to 0. + // In latex-friendly words, this function returns: + // $\sum_{i=0}^{len-1} p[i] 256^{i}$, where p[i] is unsigned. + // + // This function is equivalent with: + // uint64 val = 0; + // memcpy(&val, p, len); + // return ToHost64(val); + // TODO(user): write a small benchmark and benchmark the speed + // of a memcpy based approach. + // + // For speed reasons this function does not work for len == 0. + // The caller needs to guarantee that 1 <= len <= 8. + static uint64 Load64VariableLength(const void * const p, int len) { + assert(len >= 1 && len <= 8); + uint64 val = Load64(p); + uint64 mask = 0; + --len; + do { + mask = (mask << 8) | 0xff; + // (--len >= 0) is about 10 % faster than (len--) in some benchmarks. 
+ } while (--len >= 0); + return val & mask; + } - static void Store64(void *p, uint64 v) { - UNALIGNED_STORE64(p, FromHost64(v)); - } + static void Store64(void *p, uint64 v) { + UNALIGNED_STORE64(p, FromHost64(v)); + } - static uint128 Load128(const void *p) { - return uint128( - ToHost64(UNALIGNED_LOAD64(p)), - ToHost64(UNALIGNED_LOAD64(reinterpret_cast(p) + 1))); - } + static uint128 Load128(const void *p) { + return uint128( + ToHost64(UNALIGNED_LOAD64(p)), + ToHost64(UNALIGNED_LOAD64(reinterpret_cast(p) + 1))); + } - static void Store128(void *p, const uint128 v) { - UNALIGNED_STORE64(p, FromHost64(Uint128High64(v))); - UNALIGNED_STORE64(reinterpret_cast(p) + 1, - FromHost64(Uint128Low64(v))); - } + static void Store128(void *p, const uint128 v) { + UNALIGNED_STORE64(p, FromHost64(Uint128High64(v))); + UNALIGNED_STORE64(reinterpret_cast(p) + 1, + FromHost64(Uint128Low64(v))); + } - // Build a uint128 from 1-16 bytes. - // 8 * len least significant bits are loaded from the memory with - // BigEndian order. The 128 - 8 * len most significant bits are - // set all to 0. - static uint128 Load128VariableLength(const void *p, int len) { - if (len <= 8) { - return uint128(Load64VariableLength(static_cast(p)+8, - len)); - } else { - return uint128( - Load64VariableLength(p, len-8), - Load64(static_cast(p)+8)); - } + // Build a uint128 from 1-16 bytes. + // 8 * len least significant bits are loaded from the memory with + // BigEndian order. The 128 - 8 * len most significant bits are + // set all to 0. + static uint128 Load128VariableLength(const void *p, int len) { + if (len <= 8) { + return uint128(Load64VariableLength(static_cast(p)+8, + len)); + } else { + return uint128( + Load64VariableLength(p, len-8), + Load64(static_cast(p)+8)); } + } - // Load & Store in machine's word size. - static uword_t LoadUnsignedWord(const void *p) { - if (sizeof(uword_t) == 8) - return Load64(p); - else - return Load32(p); - } + // Load & Store in machine's word size. + static uword_t LoadUnsignedWord(const void *p) { + if (sizeof(uword_t) == 8) + return Load64(p); + else + return Load32(p); + } - static void StoreUnsignedWord(void *p, uword_t v) { - if (sizeof(uword_t) == 8) - Store64(p, v); - else - Store32(p, v); - } + static void StoreUnsignedWord(void *p, uword_t v) { + if (sizeof(uword_t) == 8) + Store64(p, v); + else + Store32(p, v); + } }; // BigEndian +// Network byte order is big-endian +typedef BigEndian NetworkByteOrder; + #endif // UTIL_ENDIAN_ENDIAN_H_ diff --git a/be/src/gutil/map-util.h b/be/src/gutil/map-util.h new file mode 100644 index 0000000000..c42672fd5f --- /dev/null +++ b/be/src/gutil/map-util.h @@ -0,0 +1,944 @@ +// Copyright 2005 Google Inc. +// +// #status: RECOMMENDED +// #category: maps +// #summary: Utility functions for use with map-like containers. +// +// This file provides utility functions for use with STL map-like data +// structures, such as std::map and hash_map. Some functions will also work with +// sets, such as ContainsKey(). +// +// The main functions in this file fall into the following categories: +// +// - Find*() +// - Contains*() +// - Insert*() +// - Lookup*() +// +// These functions often have "...OrDie" or "...OrDieNoPrint" variants. These +// variants will crash the process with a CHECK() failure on error, including +// the offending key/data in the log message. The NoPrint variants will not +// include the key/data in the log output under the assumption that it's not a +// printable type. 
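// [Editor's illustration, not part of the original comment] The flavor of the
// main lookup helpers, assuming a std::map<std::string, int> named "scores":
//
//   const int& a = FindOrDie(scores, "alice");     // CHECK-fails if the key is absent
//   const int* b = FindOrNull(scores, "bob");      // returns NULL if absent
//   int c = FindWithDefault(scores, "carol", -1);  // falls back to -1 if absent
//   bool has = ContainsKey(scores, "alice");       // plain membership test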
+// +// Most functions are fairly self explanatory from their names, with the +// exception of Find*() vs Lookup*(). The Find functions typically use the map's +// .find() member function to locate and return the map's value type. The +// Lookup*() functions typically use the map's .insert() (yes, insert) member +// function to insert the given value if necessary and returns (usually a +// reference to) the map's value type for the found item. +// +// See the per-function comments for specifics. +// +// There are also a handful of functions for doing other miscellaneous things. +// +// A note on terminology: +// +// Map-like containers are collections of pairs. Like all STL containers they +// contain a few standard typedefs identifying the types of data they contain. +// Given the following map declaration: +// +// map my_map; +// +// the notable typedefs would be as follows: +// +// - key_type -- string +// - value_type -- pair +// - mapped_type -- int +// +// Note that the map above contains two types of "values": the key-value pairs +// themselves (value_type) and the values within the key-value pairs +// (mapped_type). A value_type consists of a key_type and a mapped_type. +// +// The documentation below is written for programmers thinking in terms of keys +// and the (mapped_type) values associated with a given key. For example, the +// statement +// +// my_map["foo"] = 3; +// +// has a key of "foo" (type: string) with a value of 3 (type: int). +// + +#ifndef UTIL_GTL_MAP_UTIL_H_ +#define UTIL_GTL_MAP_UTIL_H_ + +#include + +#include +#include +#include + +#include + +#include "gutil/logging-inl.h" + +// +// Find*() +// + +// Returns a const reference to the value associated with the given key if it +// exists. Crashes otherwise. +// +// This is intended as a replacement for operator[] as an rvalue (for reading) +// when the key is guaranteed to exist. +// +// operator[] for lookup is discouraged for several reasons: +// * It has a side-effect of inserting missing keys +// * It is not thread-safe (even when it is not inserting, it can still +// choose to resize the underlying storage) +// * It invalidates iterators (when it chooses to resize) +// * It default constructs a value object even if it doesn't need to +// +// This version assumes the key is printable, and includes it in the fatal log +// message. +template +const typename Collection::mapped_type& +FindOrDie(const Collection& collection, + const typename Collection::key_type& key) { + auto it = collection.find(key); + CHECK(it != collection.end()) << "Map key not found: " << key; + return it->second; +} + +// Same as above, but returns a non-const reference. +template +typename Collection::mapped_type& +FindOrDie(Collection& collection, // NOLINT + const typename Collection::key_type& key) { + auto it = collection.find(key); + CHECK(it != collection.end()) << "Map key not found: " << key; + return it->second; +} + +// Same as FindOrDie above, but doesn't log the key on failure. +template +const typename Collection::mapped_type& +FindOrDieNoPrint(const Collection& collection, + const typename Collection::key_type& key) { + typename Collection::const_iterator it = collection.find(key); + CHECK(it != collection.end()) << "Map key not found"; + return it->second; +} + +// Same as above, but returns a non-const reference. 
+template +typename Collection::mapped_type& +FindOrDieNoPrint(Collection& collection, // NOLINT + const typename Collection::key_type& key) { + typename Collection::iterator it = collection.find(key); + CHECK(it != collection.end()) << "Map key not found"; + return it->second; +} + +// Returns a const reference to the value associated with the given key if it +// exists, otherwise a const reference to the provided default value is +// returned. +// +// WARNING: If a temporary object is passed as the default "value," this +// function will return a reference to that temporary object, which will be +// destroyed by the end of the statement. Specifically, if you have a map with +// string values, and you pass a char* as the default "value," either use the +// returned value immediately or store it in a string (not string&). Details: +template +const typename Collection::mapped_type& +FindWithDefault(const Collection& collection, + const typename Collection::key_type& key, + const typename Collection::mapped_type& value) { + auto it = collection.find(key); + if (it == collection.end()) { + return value; + } + return it->second; +} + +// Returns a pointer to the const value associated with the given key if it +// exists, or NULL otherwise. +template +const typename Collection::mapped_type* +FindOrNull(const Collection& collection, + const typename Collection::key_type& key) { + auto it = collection.find(key); + if (it == collection.end()) { + return 0; + } + return &it->second; +} + +// Same as above but returns a pointer to the non-const value. +template +typename Collection::mapped_type* +FindOrNull(Collection& collection, // NOLINT + const typename Collection::key_type& key) { + auto it = collection.find(key); + if (it == collection.end()) { + return 0; + } + return &it->second; +} + +// Returns a pointer to the const value associated with the greatest key +// that's less than or equal to the given key, or NULL if no such key exists. +template +const typename Collection::mapped_type* +FindFloorOrNull(const Collection& collection, + const typename Collection::key_type& key) { + auto it = collection.upper_bound(key); + if (it == collection.begin()) { + return 0; + } + return &(--it)->second; +} + +// Same as above but returns a pointer to the non-const value. +template +typename Collection::mapped_type* +FindFloorOrNull(Collection& collection, // NOLINT + const typename Collection::key_type& key) { + auto it = collection.upper_bound(key); + if (it == collection.begin()) { + return 0; + } + return &(--it)->second; +} + +// Returns a const-reference to the value associated with the greatest key +// that's less than or equal to the given key, or crashes if it does not exist. +template +const typename Collection::mapped_type& +FindFloorOrDie(const Collection& collection, + const typename Collection::key_type& key) { + auto it = collection.upper_bound(key); + CHECK(it != collection.begin()); + return (--it)->second; +} + +// Same as above, but returns a non-const reference. +template +typename Collection::mapped_type& +FindFloorOrDie(Collection& collection, + const typename Collection::key_type& key) { + auto it = collection.upper_bound(key); + CHECK(it != collection.begin()); + return (--it)->second; +} + +// Returns the pointer value associated with the given key. If none is found, +// NULL is returned. The function is designed to be used with a map of keys to +// pointers. +// +// This function does not distinguish between a missing key and a key mapped +// to a NULL value. 
+template +typename Collection::mapped_type +FindPtrOrNull(const Collection& collection, + const typename Collection::key_type& key) { + auto it = collection.find(key); + if (it == collection.end()) { + return typename Collection::mapped_type(0); + } + return it->second; +} + +// Same as above, except takes non-const reference to collection. +// +// This function is needed for containers that propagate constness to the +// pointee, such as boost::ptr_map. +template +typename Collection::mapped_type +FindPtrOrNull(Collection& collection, // NOLINT + const typename Collection::key_type& key) { + auto it = collection.find(key); + if (it == collection.end()) { + return typename Collection::mapped_type(0); + } + return it->second; +} + +// FindPtrOrNull like function for maps whose value is a smart pointer like shared_ptr or +// unique_ptr. +// Returns the raw pointer contained in the smart pointer for the first found key, if it exists, +// or null if it doesn't. +template +typename Collection::mapped_type::element_type* +FindPointeeOrNull(const Collection& collection, // NOLINT, + const typename Collection::key_type& key) { + auto it = collection.find(key); + if (it == collection.end()) { + return nullptr; + } + return it->second.get(); +} + +// Finds the value associated with the given key and copies it to *value (if not +// NULL). Returns false if the key was not found, true otherwise. +template +bool FindCopy(const Collection& collection, + const Key& key, + Value* const value) { + auto it = collection.find(key); + if (it == collection.end()) { + return false; + } + if (value) { + *value = it->second; + } + return true; +} + +// +// Contains*() +// + +// Returns true iff the given collection contains the given key. +template +bool ContainsKey(const Collection& collection, const Key& key) { + return collection.find(key) != collection.end(); +} + +// Returns true iff the given collection contains the given key-value pair. +template +bool ContainsKeyValuePair(const Collection& collection, + const Key& key, + const Value& value) { + typedef typename Collection::const_iterator const_iterator; + std::pair range = collection.equal_range(key); + for (const_iterator it = range.first; it != range.second; ++it) { + if (it->second == value) { + return true; + } + } + return false; +} + +// +// Insert*() +// + +// Inserts the given key-value pair into the collection. Returns true if the +// given key didn't previously exist. If the given key already existed in the +// map, its value is changed to the given "value" and false is returned. +template +bool InsertOrUpdate(Collection* const collection, + const typename Collection::value_type& vt) { + std::pair ret = collection->insert(vt); + if (!ret.second) { + // update + ret.first->second = vt.second; + return false; + } + return true; +} + +// Same as above, except that the key and value are passed separately. +template +bool InsertOrUpdate(Collection* const collection, + const typename Collection::key_type& key, + const typename Collection::mapped_type& value) { + return InsertOrUpdate( + collection, typename Collection::value_type(key, value)); +} + +// Inserts/updates all the key-value pairs from the range defined by the +// iterators "first" and "last" into the given collection. 
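// [Editor's illustration, not part of the original comment] Typical use of the
// insert/update helpers defined next, assuming a std::map<std::string, int>
// destination and a vector of pairs:
//
//   std::vector<std::pair<std::string, int>> src = {{"a", 1}, {"b", 2}};
//   std::map<std::string, int> dst;
//   InsertOrUpdateMany(&dst, src.begin(), src.end());  // dst now holds both pairs
//   InsertOrUpdate(&dst, "a", 10);                      // overwrites the existing value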
+template +void InsertOrUpdateMany(Collection* const collection, + InputIterator first, InputIterator last) { + for (; first != last; ++first) { + InsertOrUpdate(collection, *first); + } +} + +// Change the value associated with a particular key in a map or hash_map +// of the form map which owns the objects pointed to by the +// value pointers. If there was an existing value for the key, it is deleted. +// True indicates an insert took place, false indicates an update + delete. +template +bool InsertAndDeleteExisting( + Collection* const collection, + const typename Collection::key_type& key, + const typename Collection::mapped_type& value) { + std::pair ret = + collection->insert(typename Collection::value_type(key, value)); + if (!ret.second) { + delete ret.first->second; + ret.first->second = value; + return false; + } + return true; +} + +// Inserts the given key and value into the given collection iff the given key +// did NOT already exist in the collection. If the key previously existed in the +// collection, the value is not changed. Returns true if the key-value pair was +// inserted; returns false if the key was already present. +template +bool InsertIfNotPresent(Collection* const collection, + const typename Collection::value_type& vt) { + return collection->insert(vt).second; +} + +// Same as above except the key and value are passed separately. +template +bool InsertIfNotPresent( + Collection* const collection, + const typename Collection::key_type& key, + const typename Collection::mapped_type& value) { + return InsertIfNotPresent( + collection, typename Collection::value_type(key, value)); +} + +// Same as above except dies if the key already exists in the collection. +template +void InsertOrDie(Collection* const collection, + const typename Collection::value_type& value) { + CHECK(InsertIfNotPresent(collection, value)) << "duplicate value: " << value; +} + +// Same as above except doesn't log the value on error. +template +void InsertOrDieNoPrint(Collection* const collection, + const typename Collection::value_type& value) { + CHECK(InsertIfNotPresent(collection, value)) << "duplicate value."; +} + +// Inserts the key-value pair into the collection. Dies if key was already +// present. +template +void InsertOrDie(Collection* const collection, + const typename Collection::key_type& key, + const typename Collection::mapped_type& data) { + CHECK(InsertIfNotPresent(collection, key, data)) + << "duplicate key: " << key; +} + +// Same as above except doesn't log the key on error. +template +void InsertOrDieNoPrint( + Collection* const collection, + const typename Collection::key_type& key, + const typename Collection::mapped_type& data) { + CHECK(InsertIfNotPresent(collection, key, data)) << "duplicate key."; +} + +// Inserts a new key and default-initialized value. Dies if the key was already +// present. Returns a reference to the value. Example usage: +// +// map m; +// SomeProto& proto = InsertKeyOrDie(&m, 3); +// proto.set_field("foo"); +template +typename Collection::mapped_type& InsertKeyOrDie( + Collection* const collection, + const typename Collection::key_type& key) { + typedef typename Collection::value_type value_type; + std::pair res = + collection->insert(value_type(key, typename Collection::mapped_type())); + CHECK(res.second) << "duplicate key: " << key; + return res.first->second; +} + +// +// Emplace*() +// +template +bool EmplaceIfNotPresent(Collection* const collection, + Args&&...
args) { + return collection->emplace(std::forward(args)...).second; +} + +// Emplaces the given key-value pair into the collection. Returns true if the +// given key didn't previously exist. If the given key already existed in the +// map, its value is changed to the given "value" and false is returned. +template +bool EmplaceOrUpdate(Collection* const collection, + const typename Collection::key_type& key, + typename Collection::mapped_type&& value) { + typedef typename Collection::mapped_type mapped_type; + auto it = collection->find(key); + if (it == collection->end()) { + collection->emplace(key, std::forward(value)); + return true; + } + it->second = std::forward(value); + return false; +} + +template +void EmplaceOrDie(Collection* const collection, + Args&&... args) { + CHECK(EmplaceIfNotPresent(collection, std::forward(args)...)) + << "duplicate value"; +} + +// +// Lookup*() +// + +// Looks up a given key and value pair in a collection and inserts the key-value +// pair if it's not already present. Returns a reference to the value associated +// with the key. +template +typename Collection::mapped_type& +LookupOrInsert(Collection* const collection, + const typename Collection::value_type& vt) { + return collection->insert(vt).first->second; +} + +// Same as above except the key-value are passed separately. +template +typename Collection::mapped_type& +LookupOrInsert(Collection* const collection, + const typename Collection::key_type& key, + const typename Collection::mapped_type& value) { + return LookupOrInsert( + collection, typename Collection::value_type(key, value)); +} + +// It's similar to LookupOrInsert() but uses the emplace and r-value mechanics +// to achieve the desired results. The constructor of the new element is called +// with exactly the same arguments as supplied to emplace, forwarded via +// std::forward(args). The element may be constructed even if there +// already is an element with the same key in the container, in which case the +// newly constructed element will be destroyed immediately. +// For details, see +// https://en.cppreference.com/w/cpp/container/map/emplace +// https://en.cppreference.com/w/cpp/container/unordered_map/emplace +template +typename Collection::mapped_type& +LookupOrEmplace(Collection* const collection, Args&&... args) { + return collection->emplace(std::forward(args)...).first->second; +} + +// Counts the number of equivalent elements in the given "sequence", and stores +// the results in "count_map" with element as the key and count as the value. +// +// Example: +// vector v = {"a", "b", "c", "a", "b"}; +// map m; +// AddTokenCounts(v, 1, &m); +// assert(m["a"] == 2); +// assert(m["b"] == 2); +// assert(m["c"] == 1); +template +void AddTokenCounts( + const Sequence& sequence, + const typename Collection::mapped_type& increment, + Collection* const count_map) { + for (typename Sequence::const_iterator it = sequence.begin(); + it != sequence.end(); ++it) { + typename Collection::mapped_type& value = + LookupOrInsert(count_map, *it, + typename Collection::mapped_type()); + value += increment; + } +} + +// Helpers for LookupOrInsertNew(), needed to create a new value type when the +// type itself is a pointer, i.e., these extract the actual type from a pointer. +template +void MapUtilAssignNewDefaultInstance(T** location) { + *location = new T(); +} + +template +void MapUtilAssignNewInstance(T** location, const Arg &arg) { + *location = new T(arg); +} + +// Returns a reference to the value associated with key. 
If not found, a value +// is default constructed on the heap and added to the map. +// +// This function is useful for containers of the form map, where +// inserting a new key, value pair involves constructing a new heap-allocated +// Value, and storing a pointer to that in the collection. +template +typename Collection::mapped_type& +LookupOrInsertNew(Collection* const collection, + const typename Collection::key_type& key) { + std::pair ret = + collection->insert( + typename Collection::value_type(key, + static_cast(NULL))); + if (ret.second) { + // This helper is needed to 'extract' the Value type from the type of the + // container value, which is (Value*). + MapUtilAssignNewDefaultInstance(&(ret.first->second)); + } + return ret.first->second; +} + +// Same as above but constructs the value using the single-argument constructor +// and the given "arg". +template +typename Collection::mapped_type& +LookupOrInsertNew(Collection* const collection, + const typename Collection::key_type& key, + const Arg& arg) { + std::pair ret = + collection->insert( + typename Collection::value_type( + key, + static_cast(NULL))); + if (ret.second) { + // This helper is needed to 'extract' the Value type from the type of the + // container value, which is (Value*). + MapUtilAssignNewInstance(&(ret.first->second), arg); + } + return ret.first->second; +} + +// Lookup of linked/shared pointers is used in two scenarios: +// +// Use LookupOrInsertSharedPtr if the container does not own the elements +// for their whole lifetime. This is typically the case when a reader allows +// parallel updates to the container. In this case a Mutex only needs to lock +// container operations, but all element operations must be performed on the +// shared pointer. Finding an element must be performed using FindPtr*() and +// cannot be done with FindLinkedPtr*() even though it compiles. + +// Lookup a key in a map or hash_map whose values are shared_ptrs. If it is +// missing, set collection[key].reset(new Value::element_type). Unlike +// LookupOrInsertNewLinkedPtr, this function returns the shared_ptr instead of +// the raw pointer. Value::element_type must be default constructable. +template +typename Collection::mapped_type& +LookupOrInsertNewSharedPtr( + Collection* const collection, + const typename Collection::key_type& key) { + typedef typename Collection::mapped_type SharedPtr; + typedef typename Collection::mapped_type::element_type Element; + std::pair ret = + collection->insert(typename Collection::value_type(key, SharedPtr())); + if (ret.second) { + ret.first->second.reset(new Element()); + } + return ret.first->second; +} + +// A variant of LookupOrInsertNewSharedPtr where the value is constructed using +// a single-parameter constructor. Note: the constructor argument is computed +// even if it will not be used, so only values cheap to compute should be passed +// here. On the other hand it does not matter how expensive the construction of +// the actual stored value is, as that only occurs if necessary. 
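// [Editor's illustration, not part of the original comment] For example, with a
// hypothetical Widget type that has a Widget(int) constructor:
//
//   std::map<std::string, std::shared_ptr<Widget>> cache;
//   std::shared_ptr<Widget>& w = LookupOrInsertNewSharedPtr(&cache, "k", 42);
//
// Widget(42) is constructed only if "k" was not already present in the map.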
+template +typename Collection::mapped_type& +LookupOrInsertNewSharedPtr( + Collection* const collection, + const typename Collection::key_type& key, + const Arg& arg) { + typedef typename Collection::mapped_type SharedPtr; + typedef typename Collection::mapped_type::element_type Element; + std::pair ret = + collection->insert(typename Collection::value_type(key, SharedPtr())); + if (ret.second) { + ret.first->second.reset(new Element(arg)); + } + return ret.first->second; +} + +// +// Misc Utility Functions +// + +// Updates the value associated with the given key. If the key was not already +// present, then the key-value pair are inserted and "previous" is unchanged. If +// the key was already present, the value is updated and "*previous" will +// contain a copy of the old value. +// +// InsertOrReturnExisting has complementary behavior that returns the +// address of an already existing value, rather than updating it. +template +bool UpdateReturnCopy(Collection* const collection, + const typename Collection::key_type& key, + const typename Collection::mapped_type& value, + typename Collection::mapped_type* previous) { + std::pair ret = + collection->insert(typename Collection::value_type(key, value)); + if (!ret.second) { + // update + if (previous) { + *previous = ret.first->second; + } + ret.first->second = value; + return true; + } + return false; +} + +// Same as above except that the key and value are passed as a pair. +template +bool UpdateReturnCopy(Collection* const collection, + const typename Collection::value_type& vt, + typename Collection::mapped_type* previous) { + std::pair ret = + collection->insert(vt); + if (!ret.second) { + // update + if (previous) { + *previous = ret.first->second; + } + ret.first->second = vt.second; + return true; + } + return false; +} + +// Tries to insert the given key-value pair into the collection. Returns NULL if +// the insert succeeds. Otherwise, returns a pointer to the existing value. +// +// This complements UpdateReturnCopy in that it allows to update only after +// verifying the old value and still insert quickly without having to look up +// twice. Unlike UpdateReturnCopy this also does not come with the issue of an +// undefined previous* in case new data was inserted. +template +typename Collection::mapped_type* const +InsertOrReturnExisting(Collection* const collection, + const typename Collection::value_type& vt) { + std::pair ret = collection->insert(vt); + if (ret.second) { + return NULL; // Inserted, no existing previous value. + } else { + return &ret.first->second; // Return address of already existing value. + } +} + +// Same as above, except for explicit key and data. +template +typename Collection::mapped_type* const +InsertOrReturnExisting( + Collection* const collection, + const typename Collection::key_type& key, + const typename Collection::mapped_type& data) { + return InsertOrReturnExisting(collection, + typename Collection::value_type(key, data)); +} + +// Saves the reverse mapping into reverse. Key/value pairs are inserted in the +// order the iterator returns them. +template +void ReverseMap(const Collection& collection, + ReverseCollection* const reverse) { + CHECK(reverse != NULL); + for (typename Collection::const_iterator it = collection.begin(); + it != collection.end(); + ++it) { + InsertOrUpdate(reverse, it->second, it->first); + } +} + +// Erases the collection item identified by the given key, and returns the value +// associated with that key. 
It is assumed that the value (i.e., the +// mapped_type) is a pointer. Returns NULL if the key was not found in the +// collection. +// +// Examples: +// map my_map; +// +// One line cleanup: +// delete EraseKeyReturnValuePtr(&my_map, "abc"); +// +// Use returned value: +// gscoped_ptr value_ptr(EraseKeyReturnValuePtr(&my_map, "abc")); +// if (value_ptr.get()) +// value_ptr->DoSomething(); +// +// Note: if 'collection' is a multimap, this will only erase and return the +// first value. +template +typename Collection::mapped_type EraseKeyReturnValuePtr( + Collection* const collection, + const typename Collection::key_type& key) { + auto it = collection->find(key); + if (it == collection->end()) { + return typename Collection::mapped_type(); + } + typename Collection::mapped_type v = std::move(it->second); + collection->erase(it); + return v; +} + +// Inserts all the keys from map_container into key_container, which must +// support insert(MapContainer::key_type). +// +// Note: any initial contents of the key_container are not cleared. +template +void InsertKeysFromMap(const MapContainer& map_container, + KeyContainer* key_container) { + CHECK(key_container != NULL); + for (typename MapContainer::const_iterator it = map_container.begin(); + it != map_container.end(); ++it) { + key_container->insert(it->first); + } +} + +// Appends all the keys from map_container into key_container, which must +// support push_back(MapContainer::key_type). +// +// Note: any initial contents of the key_container are not cleared. +template +void AppendKeysFromMap(const MapContainer& map_container, + KeyContainer* key_container) { + CHECK(key_container != NULL); + for (typename MapContainer::const_iterator it = map_container.begin(); + it != map_container.end(); ++it) { + key_container->push_back(it->first); + } +} + +// A more specialized overload of AppendKeysFromMap to optimize reallocations +// for the common case in which we're appending keys to a vector and hence can +// (and sometimes should) call reserve() first. +// +// (It would be possible to play SFINAE games to call reserve() for any +// container that supports it, but this seems to get us 99% of what we need +// without the complexity of a SFINAE-based solution.) +template +void AppendKeysFromMap(const MapContainer& map_container, + std::vector* key_container) { + CHECK(key_container != NULL); + // We now have the opportunity to call reserve(). Calling reserve() every + // time is a bad idea for some use cases: libstdc++'s implementation of + // vector<>::reserve() resizes the vector's backing store to exactly the + // given size (unless it's already at least that big). Because of this, + // the use case that involves appending a lot of small maps (total size + // N) one by one to a vector would be O(N^2). But never calling reserve() + // loses the opportunity to improve the use case of adding from a large + // map to an empty vector (this improves performance by up to 33%). A + // number of heuristics are possible; see the discussion in + // cl/34081696. Here we use the simplest one. + if (key_container->empty()) { + key_container->reserve(map_container.size()); + } + for (typename MapContainer::const_iterator it = map_container.begin(); + it != map_container.end(); ++it) { + key_container->push_back(it->first); + } +} + +// Inserts all the values from map_container into value_container, which must +// support push_back(MapContainer::mapped_type). +// +// Note: any initial contents of the value_container are not cleared. 
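// [Editor's illustration, not part of the original comment] For example:
//
//   std::map<std::string, int> m = {{"a", 1}, {"b", 2}};
//   std::vector<int> values;
//   AppendValuesFromMap(m, &values);   // values == {1, 2}
//   std::vector<std::string> keys;
//   AppendKeysFromMap(m, &keys);       // keys == {"a", "b"}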
+template +void AppendValuesFromMap(const MapContainer& map_container, + ValueContainer* value_container) { + CHECK(value_container != NULL); + for (typename MapContainer::const_iterator it = map_container.begin(); + it != map_container.end(); ++it) { + value_container->push_back(it->second); + } +} + +template +void EmplaceValuesFromMap(MapContainer&& map_container, + ValueContainer* value_container) { + CHECK(value_container != nullptr); + // See AppendKeysFromMap for why this is done. + if (value_container->empty()) { + value_container->reserve(map_container.size()); + } + for (auto&& entry : map_container) { + value_container->emplace_back(std::move(entry.second)); + } +} + +// A more specialized overload of AppendValuesFromMap to optimize reallocations +// for the common case in which we're appending values to a vector and hence +// can (and sometimes should) call reserve() first. +// +// (It would be possible to play SFINAE games to call reserve() for any +// container that supports it, but this seems to get us 99% of what we need +// without the complexity of a SFINAE-based solution.) +template +void AppendValuesFromMap(const MapContainer& map_container, + std::vector* value_container) { + EmplaceValuesFromMap(map_container, value_container); +} + +// Compute and insert new value if it's absent from the map. Return a pair with a reference to the +// value and a bool indicating whether it was absent at first. +// +// This inspired on a similar java construct (url split in two lines): +// https://docs.oracle.com/javase/8/docs/api/java/util/concurrent/ConcurrentHashMap.html +// #computeIfAbsent-K-java.util.function.Function +// +// It takes a reference to the key and a lambda function. If the key exists in the map, returns +// a pair with a pointer to the current value and 'false'. If the key does not exist in the map, +// it uses the lambda function to create a value, inserts it into the map, and returns a pair with +// a pointer to the new value and 'true'. +// +// Example usage: +// +// auto result = ComputeIfAbsentReturnAbsense(&my_collection, +// my_key, +// [] { return new_value; }); +// MyValue* const value = result.first; +// if (result.second) .... +// +// The ComputePair* variants expect a lambda that creates a pair. This +// can be useful if the key is a StringPiece pointing to external state to +// avoid excess memory for the keys, while being safer in multi-threaded +// contexts, e.g. in case the key goes out of scope before the container does. 
+// +// Example usage: +// +// map> string_to_idx; +// vector> pbs; +// auto result = ComputePairIfAbsentReturnAbsense(&string_to_idx, my_key, +// [&]() { +// unique_ptr s = new StringPB(); +// s->set_string(my_key); +// int idx = pbs.size(); +// pbs.emplace_back(s.release()); +// return make_pair(StringPiece(pbs.back()->string()), idx); +// }); +template +std::pair +ComputePairIfAbsentReturnAbsense(MapContainer* container, + const typename MapContainer::key_type& key, + Function compute_pair_func) { + typename MapContainer::iterator iter = container->find(key); + bool new_value = iter == container->end(); + if (new_value) { + auto p = compute_pair_func(); + std::pair result = + container->emplace(std::move(p.first), std::move(p.second)); + DCHECK(result.second) << "duplicate key: " << key; + iter = result.first; + } + return std::make_pair(&iter->second, new_value); +} +template +std::pair +ComputeIfAbsentReturnAbsense(MapContainer* container, + const typename MapContainer::key_type& key, + Function compute_func) { + return ComputePairIfAbsentReturnAbsense(container, key, [&key, &compute_func] { + return std::make_pair(key, compute_func()); + }); +}; + +// Like the above but doesn't return a pair, just returns a pointer to the value. +// Example usage: +// +// MyValue* const value = ComputeIfAbsent(&my_collection, +// my_key, +// [] { return new_value; }); +// +template +typename MapContainer::mapped_type* const +ComputeIfAbsent(MapContainer* container, + const typename MapContainer::key_type& key, + Function compute_func) { + return ComputeIfAbsentReturnAbsense(container, key, compute_func).first; +}; + +template +typename MapContainer::mapped_type* const +ComputePairIfAbsent(MapContainer* container, + const typename MapContainer::key_type& key, + Function compute_pair_func) { + return ComputePairIfAbsentReturnAbsense(container, key, compute_pair_func).first; +}; + +#endif // UTIL_GTL_MAP_UTIL_H_ diff --git a/be/src/gutil/ref_counted.cc b/be/src/gutil/ref_counted.cc new file mode 100644 index 0000000000..280d9df25e --- /dev/null +++ b/be/src/gutil/ref_counted.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "gutil/ref_counted.h" + +#include +#include "gutil/atomic_refcount.h" + +namespace doris { + +namespace subtle { + +RefCountedBase::RefCountedBase() + : ref_count_(0) +#ifndef NDEBUG + , in_dtor_(false) +#endif + { +} + +RefCountedBase::~RefCountedBase() { +#ifndef NDEBUG + DCHECK(in_dtor_) << "RefCounted object deleted without calling Release()"; +#endif +} + +void RefCountedBase::AddRef() const { + // TODO(maruel): Add back once it doesn't assert 500 times/sec. + // Current thread books the critical section "AddRelease" without release it. + // DFAKE_SCOPED_LOCK_THREAD_LOCKED(add_release_); +#ifndef NDEBUG + DCHECK(!in_dtor_); +#endif + ++ref_count_; +} + +bool RefCountedBase::Release() const { + // TODO(maruel): Add back once it doesn't assert 500 times/sec. + // Current thread books the critical section "AddRelease" without release it. 
+ // DFAKE_SCOPED_LOCK_THREAD_LOCKED(add_release_); +#ifndef NDEBUG + DCHECK(!in_dtor_); +#endif + if (--ref_count_ == 0) { +#ifndef NDEBUG + in_dtor_ = true; +#endif + return true; + } + return false; +} + +bool RefCountedThreadSafeBase::HasOneRef() const { + return base::RefCountIsOne( + &const_cast(this)->ref_count_); +} + +RefCountedThreadSafeBase::RefCountedThreadSafeBase() : ref_count_(0) { +#ifndef NDEBUG + in_dtor_ = false; +#endif +} + +RefCountedThreadSafeBase::~RefCountedThreadSafeBase() { +#ifndef NDEBUG + DCHECK(in_dtor_) << "RefCountedThreadSafe object deleted without " + "calling Release()"; +#endif +} + +void RefCountedThreadSafeBase::AddRef() const { +#ifndef NDEBUG + DCHECK(!in_dtor_); +#endif + base::RefCountInc(&ref_count_); +} + +bool RefCountedThreadSafeBase::Release() const { +#ifndef NDEBUG + DCHECK(!in_dtor_); + DCHECK(!base::RefCountIsZero(&ref_count_)); +#endif + if (!base::RefCountDec(&ref_count_)) { +#ifndef NDEBUG + in_dtor_ = true; +#endif + return true; + } + return false; +} + +} // namespace subtle + +} // namespace doris diff --git a/be/src/gutil/ref_counted.h b/be/src/gutil/ref_counted.h new file mode 100644 index 0000000000..5cd73b262c --- /dev/null +++ b/be/src/gutil/ref_counted.h @@ -0,0 +1,365 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_MEMORY_REF_COUNTED_H_ +#define BASE_MEMORY_REF_COUNTED_H_ + +#include +#include +#include // IWYU pragma: keep + +#include "gutil/atomicops.h" +#include "gutil/macros.h" +#include "gutil/threading/thread_collision_warner.h" + +namespace doris { +namespace subtle { + +typedef Atomic32 AtomicRefCount; + +class RefCountedBase { + public: + bool HasOneRef() const { return ref_count_ == 1; } + + protected: + RefCountedBase(); + ~RefCountedBase(); + + void AddRef() const; + + // Returns true if the object should self-delete. + bool Release() const; + + private: + mutable int ref_count_; +#ifndef NDEBUG + mutable bool in_dtor_; +#endif + + DFAKE_MUTEX(add_release_); + + DISALLOW_COPY_AND_ASSIGN(RefCountedBase); +}; + +class RefCountedThreadSafeBase { + public: + bool HasOneRef() const; + + protected: + RefCountedThreadSafeBase(); + ~RefCountedThreadSafeBase(); + + void AddRef() const; + + // Returns true if the object should self-delete. + bool Release() const; + + private: + mutable AtomicRefCount ref_count_; +#ifndef NDEBUG + mutable bool in_dtor_; +#endif + + DISALLOW_COPY_AND_ASSIGN(RefCountedThreadSafeBase); +}; + +} // namespace subtle + +// +// A base class for reference counted classes. Otherwise, known as a cheap +// knock-off of WebKit's RefCounted class. To use this guy just extend your +// class from it like so: +// +// class MyFoo : public RefCounted { +// ... +// private: +// friend class RefCounted; +// ~MyFoo(); +// }; +// +// You should always make your destructor private, to avoid any code deleting +// the object accidently while there are references to it. +template +class RefCounted : public subtle::RefCountedBase { + public: + RefCounted() {} + + void AddRef() const { + subtle::RefCountedBase::AddRef(); + } + + void Release() const { + if (subtle::RefCountedBase::Release()) { + delete static_cast(this); + } + } + + protected: + ~RefCounted() {} + + private: + DISALLOW_COPY_AND_ASSIGN(RefCounted); +}; + +// Forward declaration. +template class RefCountedThreadSafe; + +// Default traits for RefCountedThreadSafe. 
Deletes the object when its ref +// count reaches 0. Overload to delete it on a different thread etc. +template +struct DefaultRefCountedThreadSafeTraits { + static void Destruct(const T* x) { + // Delete through RefCountedThreadSafe to make child classes only need to be + // friend with RefCountedThreadSafe instead of this struct, which is an + // implementation detail. + RefCountedThreadSafe::DeleteInternal(x); + } +}; + +// +// A thread-safe variant of RefCounted +// +// class MyFoo : public RefCountedThreadSafe { +// ... +// }; +// +// If you're using the default trait, then you should add compile time +// asserts that no one else is deleting your object. i.e. +// private: +// friend class RefCountedThreadSafe; +// ~MyFoo(); +template > +class RefCountedThreadSafe : public subtle::RefCountedThreadSafeBase { + public: + RefCountedThreadSafe() {} + + void AddRef() const { + subtle::RefCountedThreadSafeBase::AddRef(); + } + + void Release() const { + if (subtle::RefCountedThreadSafeBase::Release()) { + Traits::Destruct(static_cast(this)); + } + } + + protected: + ~RefCountedThreadSafe() {} + + private: + friend struct DefaultRefCountedThreadSafeTraits; + static void DeleteInternal(const T* x) { delete x; } + + DISALLOW_COPY_AND_ASSIGN(RefCountedThreadSafe); +}; + +// +// A thread-safe wrapper for some piece of data so we can place other +// things in scoped_refptrs<>. +// +template +class RefCountedData + : public doris::RefCountedThreadSafe< doris::RefCountedData > { + public: + RefCountedData() : data() {} + RefCountedData(const T& in_value) : data(in_value) {} + + T data; + + private: + friend class doris::RefCountedThreadSafe >; + ~RefCountedData() {} +}; + +} // namespace doris + +// +// A smart pointer class for reference counted objects. Use this class instead +// of calling AddRef and Release manually on a reference counted object to +// avoid common memory leaks caused by forgetting to Release an object +// reference. Sample usage: +// +// class MyFoo : public RefCounted { +// ... +// }; +// +// void some_function() { +// scoped_refptr foo = new MyFoo(); +// foo->Method(param); +// // |foo| is released when this function returns +// } +// +// void some_other_function() { +// scoped_refptr foo = new MyFoo(); +// ... +// foo = NULL; // explicitly releases |foo| +// ... +// if (foo) +// foo->Method(param); +// } +// +// The above examples show how scoped_refptr acts like a pointer to T. +// Given two scoped_refptr classes, it is also possible to exchange +// references between the two objects, like so: +// +// { +// scoped_refptr a = new MyFoo(); +// scoped_refptr b; +// +// b.swap(a); +// // now, |b| references the MyFoo object, and |a| references NULL. +// } +// +// To make both |a| and |b| in the above example reference the same MyFoo +// object, simply use the assignment operator: +// +// { +// scoped_refptr a = new MyFoo(); +// scoped_refptr b; +// +// b = a; +// // now, |a| and |b| each own a reference to the same MyFoo object. +// } +// +template +class scoped_refptr { + public: + typedef T element_type; + + scoped_refptr() : ptr_(NULL) { + } + + scoped_refptr(T* p) : ptr_(p) { + if (ptr_) + ptr_->AddRef(); + } + + // Copy constructor. + scoped_refptr(const scoped_refptr& r) : ptr_(r.ptr_) { + if (ptr_) + ptr_->AddRef(); + } + + // Copy conversion constructor. + template + scoped_refptr(const scoped_refptr& r) : ptr_(r.get()) { + if (ptr_) + ptr_->AddRef(); + } + + // Move constructor. 
This is required in addition to the conversion + // constructor below in order for clang to warn about pessimizing moves. + scoped_refptr(scoped_refptr&& r) noexcept : ptr_(r.get()) { // NOLINT + r.ptr_ = nullptr; + } + + // Move conversion constructor. + template + scoped_refptr(scoped_refptr&& r) noexcept : ptr_(r.get()) { // NOLINT + r.ptr_ = nullptr; + } + + ~scoped_refptr() { + if (ptr_) + ptr_->Release(); + } + + T* get() const { return ptr_; } + +// The following is disabled in Kudu's version of this file since it's +// relatively dangerous. Chromium is planning on doing the same in their +// tree, but hasn't done so yet. See http://code.google.com/p/chromium/issues/detail?id=110610 +#if SCOPED_REFPTR_ALLOW_IMPLICIT_CONVERSION_TO_PTR + // Allow scoped_refptr to be used in boolean expression + // and comparison operations. + operator T*() const { return ptr_; } +#else + typedef T* scoped_refptr::*Testable; + operator Testable() const { return ptr_ ? &scoped_refptr::ptr_ : NULL; } +#endif + + T* operator->() const { + assert(ptr_ != NULL); + return ptr_; + } + + scoped_refptr& operator=(T* p) { + // AddRef first so that self assignment should work + if (p) + p->AddRef(); + T* old_ptr = ptr_; + ptr_ = p; + if (old_ptr) + old_ptr->Release(); + return *this; + } + + scoped_refptr& operator=(const scoped_refptr& r) { + return *this = r.ptr_; + } + + template + scoped_refptr& operator=(const scoped_refptr& r) { + return *this = r.get(); + } + + scoped_refptr& operator=(scoped_refptr&& r) { + scoped_refptr(std::move(r)).swap(*this); + return *this; + } + + template + scoped_refptr& operator=(scoped_refptr&& r) { + scoped_refptr(std::move(r)).swap(*this); + return *this; + } + + void swap(T** pp) { + T* p = ptr_; + ptr_ = *pp; + *pp = p; + } + + void swap(scoped_refptr& r) { + swap(&r.ptr_); + } + + // Like gscoped_ptr::reset(), drops a reference on the currently held object + // (if any), and adds a reference to the passed-in object (if not NULL). + void reset(T* p = NULL) { + *this = p; + } + + protected: + T* ptr_; + + private: + template friend class scoped_refptr; +}; + +// Handy utility for creating a scoped_refptr out of a T* explicitly without +// having to retype all the template arguments +template +scoped_refptr make_scoped_refptr(T* t) { + return scoped_refptr(t); +} + +// equal_to and hash implementations for templated scoped_refptrs suitable for +// use with STL unordered_* containers. +template +struct ScopedRefPtrEqualToFunctor { + bool operator()(const scoped_refptr& x, const scoped_refptr& y) const { + return x.get() == y.get(); + } +}; + +template +struct ScopedRefPtrHashFunctor { + size_t operator()(const scoped_refptr& p) const { + return reinterpret_cast(p.get()); + } +}; + +#endif // BASE_MEMORY_REF_COUNTED_H_ diff --git a/be/src/gutil/sysinfo-test.cc b/be/src/gutil/sysinfo-test.cc new file mode 100644 index 0000000000..85ccd11448 --- /dev/null +++ b/be/src/gutil/sysinfo-test.cc @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gutil/sysinfo.h" + +#include + +namespace doris { + +TEST(SysInfoTest, ReadMaxCpuIndexTest) { + using base::ParseMaxCpuIndex; + EXPECT_EQ(0, ParseMaxCpuIndex("0\n")); + EXPECT_EQ(1, ParseMaxCpuIndex("1\n")); + EXPECT_EQ(7, ParseMaxCpuIndex("0-7\n")); + EXPECT_EQ(40, ParseMaxCpuIndex("0-7,30-40\n")); + EXPECT_EQ(143, ParseMaxCpuIndex("2,4-127,128-143\n")); + EXPECT_EQ(44, ParseMaxCpuIndex("44-44\n")); + + // Don't assume that ranges are in ascending order or non-overlapping, + // just in case. + EXPECT_EQ(8, ParseMaxCpuIndex("0-7,5-8\n")); + EXPECT_EQ(7, ParseMaxCpuIndex("0-7,5-6\n")); + EXPECT_EQ(3, ParseMaxCpuIndex("2-3,0-1\n")); + EXPECT_EQ(3, ParseMaxCpuIndex("2-3,0\n")); + EXPECT_EQ(3, ParseMaxCpuIndex("3,0-2\n")); + + // Invalid inputs. + EXPECT_EQ(-1, ParseMaxCpuIndex("")); + EXPECT_EQ(-1, ParseMaxCpuIndex("\n")); + EXPECT_EQ(-1, ParseMaxCpuIndex(" ")); + EXPECT_EQ(-1, ParseMaxCpuIndex("a\n")); + EXPECT_EQ(-1, ParseMaxCpuIndex("0\n1\n")); + EXPECT_EQ(-1, ParseMaxCpuIndex("\n1\n")); + EXPECT_EQ(-1, ParseMaxCpuIndex("\n1")); + EXPECT_EQ(-1, ParseMaxCpuIndex("0-\n")); + EXPECT_EQ(-1, ParseMaxCpuIndex("-2\n")); + EXPECT_EQ(-1, ParseMaxCpuIndex("1-9qwerty\n")); + EXPECT_EQ(-1, ParseMaxCpuIndex("1-9,0-\n")); + EXPECT_EQ(-1, ParseMaxCpuIndex("1,2,3-\n")); + EXPECT_EQ(-1, ParseMaxCpuIndex("1,2,a-4\n")); + EXPECT_EQ(-1, ParseMaxCpuIndex("1,2,3@4\n")); + EXPECT_EQ(-1, ParseMaxCpuIndex("1,2,\n")); + EXPECT_EQ(-1, ParseMaxCpuIndex("3-2\n")); + + // Overflows in various positions. + EXPECT_EQ(-1, ParseMaxCpuIndex("2147483648")); // 2^31 + EXPECT_EQ(-1, ParseMaxCpuIndex("18446744073709551617")); // 2^64 + 1 + EXPECT_EQ(-1, ParseMaxCpuIndex("999999999999999999999999999999999999999999999999")); + EXPECT_EQ(-1, ParseMaxCpuIndex("0-2147483648")); // 2^31 + EXPECT_EQ(-1, ParseMaxCpuIndex("0-18446744073709551617")); // 2^64 + 1 + EXPECT_EQ(-1, ParseMaxCpuIndex("0-999999999999999999999999999999999999999999999999")); + EXPECT_EQ(-1, ParseMaxCpuIndex("2147483648-1")); // 2^31 + EXPECT_EQ(-1, ParseMaxCpuIndex("18446744073709551617-1")); // 2^64 + 1 + EXPECT_EQ(-1, ParseMaxCpuIndex("999999999999999999999999999999999999999999999999-1")); +} + +} // namespace doris diff --git a/be/src/gutil/sysinfo.cc b/be/src/gutil/sysinfo.cc new file mode 100644 index 0000000000..4365b18ff8 --- /dev/null +++ b/be/src/gutil/sysinfo.cc @@ -0,0 +1,474 @@ +// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*- +// Copyright (c) 2006, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#if (defined(_WIN32) || defined(__MINGW32__)) && !defined(__CYGWIN__) && !defined(__CYGWIN32) +# define PLATFORM_WINDOWS 1 +#endif + +#include +#include // for open() +#include // for read() + +#if defined __MACH__ // Mac OS X, almost certainly +#include // how we figure out numcpu's on OS X +#include +#elif defined __FreeBSD__ +#include +#elif defined __sun__ // Solaris +#include // for, e.g., prmap_t +#elif defined(PLATFORM_WINDOWS) +#include // for getpid() (actually, _getpid()) +#include // for SHGetValueA() +#include // for Module32First() +#endif + +#include "gutil/sysinfo.h" + +#include +#include // for errno +#include // for snprintf(), sscanf() +#include // for getenv() +#include // for memmove(), memchr(), etc. +#include +#include +#include + +#include + +#include "gutil/dynamic_annotations.h" // for RunningOnValgrind +#include "gutil/integral_types.h" +#include "gutil/macros.h" +#include "gutil/port.h" +#include "gutil/walltime.h" + +using std::numeric_limits; + +// This isn't in the 'base' namespace in tcmallc. But, tcmalloc +// exports these functions, so we need to namespace them to avoid +// the conflict. +namespace base { + +// ---------------------------------------------------------------------- +// CyclesPerSecond() +// NumCPUs() +// It's important this not call malloc! -- they may be called at +// global-construct time, before we've set up all our proper malloc +// hooks and such. +// ---------------------------------------------------------------------- + +static double cpuinfo_cycles_per_second = 1.0; // 0.0 might be dangerous +static int cpuinfo_num_cpus = 1; // Conservative guess +static int cpuinfo_max_cpu_index = -1; + +void SleepForNanoseconds(int64_t nanoseconds) { + // Sleep for nanosecond duration + struct timespec sleep_time; + sleep_time.tv_sec = nanoseconds / 1000 / 1000 / 1000; + sleep_time.tv_nsec = (nanoseconds % (1000 * 1000 * 1000)); + while (nanosleep(&sleep_time, &sleep_time) != 0 && errno == EINTR) + ; // Ignore signals and wait for the full interval to elapse. +} + +void SleepForMilliseconds(int64_t milliseconds) { + SleepForNanoseconds(milliseconds * 1000 * 1000); +} + +// Helper function estimates cycles/sec by observing cycles elapsed during +// sleep(). Using small sleep time decreases accuracy significantly. 
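+// For example (an editorial sketch of the arithmetic below): with
+// estimate_time_ms = 100 the multiplier is 1000 / 100 = 10; if the cycle
+// counter advances by roughly 2.0e8 ticks across that 100 ms sleep, the
+// estimate is 2.0e8 * 10 = 2.0e9 cycles/sec, i.e. about a 2 GHz clock.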
+static int64 EstimateCyclesPerSecond(const int estimate_time_ms) { + CHECK(estimate_time_ms > 0); + if (estimate_time_ms <= 0) + return 1; + double multiplier = 1000.0 / (double)estimate_time_ms; // scale by this much + + const int64 start_ticks = CycleClock::Now(); + SleepForMilliseconds(estimate_time_ms); + const int64 guess = int64(multiplier * (CycleClock::Now() - start_ticks)); + return guess; +} + +// ReadIntFromFile is only called on linux and cygwin platforms. +#if defined(__linux__) || defined(__CYGWIN__) || defined(__CYGWIN32__) + +// Slurp a file with a single read() call into 'buf'. This is only safe to use on small +// files in places like /proc where we are guaranteed not to get a partial read. +// Any remaining bytes in the buffer are zeroed. +// +// 'buflen' must be more than large enough to hold the whole file, or else this will +// issue a FATAL error. +static bool SlurpSmallTextFile(const char* file, char* buf, int buflen) { + bool ret = false; + int fd; + RETRY_ON_EINTR(fd, open(file, O_RDONLY)); + if (fd == -1) return ret; + + memset(buf, '\0', buflen); + int n; + RETRY_ON_EINTR(n, read(fd, buf, buflen - 1)); + CHECK_NE(n, buflen - 1) << "buffer of len " << buflen << " not large enough to store " + << "contents of " << file; + if (n > 0) { + ret = true; + } + + int close_ret; + RETRY_ON_EINTR(close_ret, close(fd)); + if (PREDICT_FALSE(close_ret != 0)) { + PLOG(WARNING) << "Failed to close fd " << fd; + } + + return ret; +} + +// Helper function for reading an int from a file. Returns true if successful +// and the memory location pointed to by value is set to the value read. +static bool ReadIntFromFile(const char *file, int *value) { + char line[1024]; + if (!SlurpSmallTextFile(file, line, arraysize(line))) { + return false; + } + char* err; + const int temp_value = strtol(line, &err, 10); + if (line[0] != '\0' && (*err == '\n' || *err == '\0')) { + *value = temp_value; + return true; + } + return false; +} + +static int ReadMaxCPUIndex() { + char buf[1024]; + // TODO(tarmstrong): KUDU-2730: 'present' doesn't include CPUs that could be hotplugged + // in the future. 'possible' does, but using it instead could result in a blow-up in the + // number of per-CPU data structures. + CHECK(SlurpSmallTextFile("/sys/devices/system/cpu/present", buf, arraysize(buf))); + int max_idx = ParseMaxCpuIndex(buf); + CHECK_GE(max_idx, 0) << "unable to parse max CPU index from: " << buf; + return max_idx; +} + +int ParseMaxCpuIndex(const char* str) { + DCHECK(str != nullptr); + const char* pos = str; + // Initialize max_idx to invalid so we can just return if we find zero ranges. + int max_idx = -1; + + while (true) { + const char* range_start = pos; + const char* dash = nullptr; + // Scan forward until we find the separator indicating end of range, which is always a + // newline or comma if the input is valid. + for (; *pos != ',' && *pos != '\n'; pos++) { + // Check for early end of string - bail here to avoid advancing past end. + if (*pos == '\0') return -1; + if (*pos == '-') { + // Multiple dashes in range is invalid. + if (dash != nullptr) return -1; + dash = pos; + } else if (!isdigit(*pos)) { + return -1; + } + } + + // At this point we found a range [range_start, pos) comprised of digits and an + // optional dash. + const char* num_start = dash == nullptr ? range_start : dash + 1; + // Check for ranges with missing numbers, e.g. "", "3-", "-3". 
+ if (num_start == pos || dash == range_start) return -1; + // The numbers are comprised only of digits, so it can only fail if it is out of + // range of int (the return type of this function). + unsigned long start_idx = strtoul(range_start, nullptr, 10); + if (start_idx > numeric_limits::max()) return -1; + unsigned long end_idx = strtoul(num_start, nullptr, 10); + if (end_idx > numeric_limits::max() || start_idx > end_idx) { + return -1; + } + // Keep track of the max index we've seen so far. + max_idx = std::max(static_cast(end_idx), max_idx); + // End of line, expect no more input. + if (*pos == '\n') break; + ++pos; + } + // String must have a single newline at the very end. + if (*pos != '\n' || *(pos + 1) != '\0') return -1; + return max_idx; +} + +#endif + +// WARNING: logging calls back to InitializeSystemInfo() so it must +// not invoke any logging code. Also, InitializeSystemInfo() can be +// called before main() -- in fact it *must* be since already_called +// isn't protected -- before malloc hooks are properly set up, so +// we make an effort not to call any routines which might allocate +// memory. + +static void InitializeSystemInfo() { + static bool already_called = false; // safe if we run before threads + if (already_called) return; + already_called = true; + + bool saw_mhz = false; + + if (RunningOnValgrind()) { + // Valgrind may slow the progress of time artificially (--scale-time=N + // option). We thus can't rely on CPU Mhz info stored in /sys or /proc + // files. Thus, actually measure the cps. + cpuinfo_cycles_per_second = EstimateCyclesPerSecond(100); + saw_mhz = true; + } + +#if defined(__linux__) || defined(__CYGWIN__) || defined(__CYGWIN32__) + char line[1024]; + char* err; + int freq; + + // If the kernel is exporting the tsc frequency use that. There are issues + // where cpuinfo_max_freq cannot be relied on because the BIOS may be + // exporintg an invalid p-state (on x86) or p-states may be used to put the + // processor in a new mode (turbo mode). Essentially, those frequencies + // cannot always be relied upon. The same reasons apply to /proc/cpuinfo as + // well. + if (!saw_mhz && + ReadIntFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)) { + // The value is in kHz (as the file name suggests). For example, on a + // 2GHz warpstation, the file contains the value "2000000". + cpuinfo_cycles_per_second = freq * 1000.0; + saw_mhz = true; + } + + // If CPU scaling is in effect, we want to use the *maximum* frequency, + // not whatever CPU speed some random processor happens to be using now. + if (!saw_mhz && + ReadIntFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", + &freq)) { + // The value is in kHz. For example, on a 2GHz machine, the file + // contains the value "2000000". + cpuinfo_cycles_per_second = freq * 1000.0; + saw_mhz = true; + } + + // Read /proc/cpuinfo for other values, and if there is no cpuinfo_max_freq. + const char* pname = "/proc/cpuinfo"; + int fd; + RETRY_ON_EINTR(fd, open(pname, O_RDONLY)); + if (fd == -1) { + PLOG(FATAL) << "Unable to read CPU info from /proc. 
procfs must be mounted."; + } + + double bogo_clock = 1.0; + bool saw_bogo = false; + int num_cpus = 0; + line[0] = line[1] = '\0'; + int chars_read = 0; + do { // we'll exit when the last read didn't read anything + // Move the next line to the beginning of the buffer + const int oldlinelen = strlen(line); + if (sizeof(line) == oldlinelen + 1) // oldlinelen took up entire line + line[0] = '\0'; + else // still other lines left to save + memmove(line, line + oldlinelen+1, sizeof(line) - (oldlinelen+1)); + // Terminate the new line, reading more if we can't find the newline + char* newline = strchr(line, '\n'); + if (newline == NULL) { + const int linelen = strlen(line); + const int bytes_to_read = sizeof(line)-1 - linelen; + CHECK(bytes_to_read > 0); // because the memmove recovered >=1 bytes + RETRY_ON_EINTR(chars_read, read(fd, line + linelen, bytes_to_read)); + line[linelen + chars_read] = '\0'; + newline = strchr(line, '\n'); + } + if (newline != NULL) + *newline = '\0'; + +#if defined(__powerpc__) || defined(__ppc__) + // PowerPC cpus report the frequency in "clock" line + if (strncasecmp(line, "clock", sizeof("clock")-1) == 0) { + const char* freqstr = strchr(line, ':'); + if (freqstr) { + // PowerPC frequencies are only reported as MHz (check 'show_cpuinfo' + // function at arch/powerpc/kernel/setup-common.c) + char *endp = strstr(line, "MHz"); + if (endp) { + *endp = 0; + cpuinfo_cycles_per_second = strtod(freqstr+1, &err) * 1000000.0; + if (freqstr[1] != '\0' && *err == '\0' && cpuinfo_cycles_per_second > 0) + saw_mhz = true; + } + } +#else + // When parsing the "cpu MHz" and "bogomips" (fallback) entries, we only + // accept postive values. Some environments (virtual machines) report zero, + // which would cause infinite looping in WallTime_Init. + if (!saw_mhz && strncasecmp(line, "cpu MHz", sizeof("cpu MHz")-1) == 0) { + const char* freqstr = strchr(line, ':'); + if (freqstr) { + cpuinfo_cycles_per_second = strtod(freqstr+1, &err) * 1000000.0; + if (freqstr[1] != '\0' && *err == '\0' && cpuinfo_cycles_per_second > 0) + saw_mhz = true; + } + } else if (strncasecmp(line, "bogomips", sizeof("bogomips")-1) == 0) { + const char* freqstr = strchr(line, ':'); + if (freqstr) { + bogo_clock = strtod(freqstr+1, &err) * 1000000.0; + if (freqstr[1] != '\0' && *err == '\0' && bogo_clock > 0) + saw_bogo = true; + } +#endif + } else if (strncasecmp(line, "processor", sizeof("processor")-1) == 0) { + num_cpus++; // count up every time we see an "processor :" entry + } + } while (chars_read > 0); + int ret; + RETRY_ON_EINTR(ret, close(fd)); + if (PREDICT_FALSE(ret != 0)) { + PLOG(WARNING) << "Failed to close fd " << fd; + } + + if (!saw_mhz) { + if (saw_bogo) { + // If we didn't find anything better, we'll use bogomips, but + // we're not happy about it. + cpuinfo_cycles_per_second = bogo_clock; + } else { + // If we don't even have bogomips, we'll use the slow estimation. + cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000); + } + } + if (cpuinfo_cycles_per_second == 0.0) { + cpuinfo_cycles_per_second = 1.0; // maybe unnecessary, but safe + } + if (num_cpus > 0) { + cpuinfo_num_cpus = num_cpus; + } + cpuinfo_max_cpu_index = ReadMaxCPUIndex(); + +#elif defined __FreeBSD__ + // For this sysctl to work, the machine must be configured without + // SMP, APIC, or APM support. hz should be 64-bit in freebsd 7.0 + // and later. Before that, it's a 32-bit quantity (and gives the + // wrong answer on machines faster than 2^32 Hz). 
See + // http://lists.freebsd.org/pipermail/freebsd-i386/2004-November/001846.html + // But also compare FreeBSD 7.0: + // http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG70#L223 + // 231 error = sysctl_handle_quad(oidp, &freq, 0, req); + // To FreeBSD 6.3 (it's the same in 6-STABLE): + // http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG6#L131 + // 139 error = sysctl_handle_int(oidp, &freq, sizeof(freq), req); +#if __FreeBSD__ >= 7 + uint64_t hz = 0; +#else + unsigned int hz = 0; +#endif + size_t sz = sizeof(hz); + const char *sysctl_path = "machdep.tsc_freq"; + if ( sysctlbyname(sysctl_path, &hz, &sz, NULL, 0) != 0 ) { + fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n", + sysctl_path, strerror(errno)); + cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000); + } else { + cpuinfo_cycles_per_second = hz; + } + // TODO(csilvers): also figure out cpuinfo_num_cpus + +#elif defined(PLATFORM_WINDOWS) +# pragma comment(lib, "shlwapi.lib") // for SHGetValue() + // In NT, read MHz from the registry. If we fail to do so or we're in win9x + // then make a crude estimate. + OSVERSIONINFO os; + os.dwOSVersionInfoSize = sizeof(os); + DWORD data, data_size = sizeof(data); + if (GetVersionEx(&os) && + os.dwPlatformId == VER_PLATFORM_WIN32_NT && + SUCCEEDED(SHGetValueA(HKEY_LOCAL_MACHINE, + "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", + "~MHz", NULL, &data, &data_size))) + cpuinfo_cycles_per_second = (int64)data * (int64)(1000 * 1000); // was mhz + else + cpuinfo_cycles_per_second = EstimateCyclesPerSecond(500); // TODO <500? + + // Get the number of processors. + SYSTEM_INFO info; + GetSystemInfo(&info); + cpuinfo_num_cpus = info.dwNumberOfProcessors; + +#elif defined(__MACH__) && defined(__APPLE__) + // returning "mach time units" per second. the current number of elapsed + // mach time units can be found by calling uint64 mach_absolute_time(); + // while not as precise as actual CPU cycles, it is accurate in the face + // of CPU frequency scaling and multi-cpu/core machines. + // Our mac users have these types of machines, and accuracy + // (i.e. correctness) trumps precision. + // See cycleclock.h: CycleClock::Now(), which returns number of mach time + // units on Mac OS X. + mach_timebase_info_data_t timebase_info; + mach_timebase_info(&timebase_info); + double mach_time_units_per_nanosecond = + static_cast(timebase_info.denom) / + static_cast(timebase_info.numer); + cpuinfo_cycles_per_second = mach_time_units_per_nanosecond * 1e9; + + int num_cpus = 0; + size_t size = sizeof(num_cpus); + int numcpus_name[] = { CTL_HW, HW_NCPU }; + if (::sysctl(numcpus_name, arraysize(numcpus_name), &num_cpus, &size, nullptr, 0) + == 0 + && (size == sizeof(num_cpus))) + cpuinfo_num_cpus = num_cpus; + +#else + // Generic cycles per second counter + cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000); +#endif + + // On platforms where we can't determine the max CPU index, just use the + // number of CPUs. This might break if CPUs are taken offline, but + // better than a wild guess. 
+ if (cpuinfo_max_cpu_index < 0) { + cpuinfo_max_cpu_index = cpuinfo_num_cpus - 1; + } +} + +double CyclesPerSecond(void) { + InitializeSystemInfo(); + return cpuinfo_cycles_per_second; +} + +int NumCPUs(void) { + InitializeSystemInfo(); + return cpuinfo_num_cpus; +} + +int MaxCPUIndex(void) { + InitializeSystemInfo(); + return cpuinfo_max_cpu_index; +} + +} // namespace base diff --git a/be/src/gutil/sysinfo.h b/be/src/gutil/sysinfo.h new file mode 100644 index 0000000000..d46cfe5504 --- /dev/null +++ b/be/src/gutil/sysinfo.h @@ -0,0 +1,69 @@ +// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*- +// Copyright (c) 2006, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef _SYSINFO_H_ +#define _SYSINFO_H_ + +#include + +namespace base { + +// Return the number of online CPUs. This is computed and cached the first time this or +// NumCPUs() is called, so does not reflect any CPUs enabled or disabled at a later +// point in time. +// +// Note that, if not all CPUs are online, this may return a value lower than the maximum +// value of sched_getcpu(). +extern int NumCPUs(); + +// Return the maximum CPU index that may be returned by sched_getcpu(). For example, on +// an 8-core machine, this will return '7' even if some of the CPUs have been disabled. +extern int MaxCPUIndex(); + +void SleepForNanoseconds(int64_t nanoseconds); +void SleepForMilliseconds(int64_t milliseconds); + +// processor cycles per second of each processor. Thread-safe. +extern double CyclesPerSecond(void); + +// Parse the maximum CPU index from 'str'. The list is in the format of the CPU lists +// under /sys/devices/system/cpu/, e.g. /sys/devices/system/cpu/present. Returns the +// index of the max CPU or -1 if the string could not be parsed. +// Examples of the format and the expected output include: +// * "0\n" -> 0 +// * "0-8\n" -> 8 +// * "0-15,32-47\n" -> 47 +// * "2,4-127,128-143\n" -> 143 +// Ref: https://www.kernel.org/doc/Documentation/cputopology.txt +// Exposed for testing. 
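+// For example (an editorial sketch):
+//   int max_idx = base::ParseMaxCpuIndex("0-15,32-47\n");  // returns 47
+// Outside of tests, callers would normally rely on MaxCPUIndex() above
+// instead, e.g. to size a per-CPU array that sched_getcpu() can index:
+//   std::vector<int64_t> per_cpu_counters(base::MaxCPUIndex() + 1);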
+extern int ParseMaxCpuIndex(const char* str); + +} // namespace base +#endif /* #ifndef _SYSINFO_H_ */ diff --git a/be/src/gutil/threading/thread_collision_warner.cc b/be/src/gutil/threading/thread_collision_warner.cc new file mode 100644 index 0000000000..26329cfb9c --- /dev/null +++ b/be/src/gutil/threading/thread_collision_warner.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "gutil/threading/thread_collision_warner.h" + +#ifdef __linux__ +#include +#else +#include +#endif + +#include + +#include +#include + +#include + +namespace base { + +void DCheckAsserter::warn(int64_t previous_thread_id, int64_t current_thread_id) { + LOG(FATAL) << "Thread Collision! Previous thread id: " << previous_thread_id + << ", current thread id: " << current_thread_id; +} + +#if 0 +// Original source from Chromium -- we didn't import their threading library +// into Kudu source as of yet + +static subtle::Atomic32 CurrentThread() { + const PlatformThreadId current_thread_id = PlatformThread::CurrentId(); + // We need to get the thread id into an atomic data type. This might be a + // truncating conversion, but any loss-of-information just increases the + // chance of a fault negative, not a false positive. + const subtle::Atomic32 atomic_thread_id = + static_cast(current_thread_id); + + return atomic_thread_id; +} +#else + +static subtle::Atomic64 CurrentThread() { +#if defined(__APPLE__) + uint64_t tid; + CHECK_EQ(0, pthread_threadid_np(NULL, &tid)); + return tid; +#elif defined(__linux__) + return syscall(__NR_gettid); +#endif +} + +#endif + +void ThreadCollisionWarner::EnterSelf() { + // If the active thread is 0 then I'll write the current thread ID + // if two or more threads arrive here only one will succeed to + // write on valid_thread_id_ the current thread ID. + subtle::Atomic64 current_thread_id = CurrentThread(); + + int64_t previous_thread_id = subtle::NoBarrier_CompareAndSwap(&valid_thread_id_, + 0, + current_thread_id); + if (previous_thread_id != 0 && previous_thread_id != current_thread_id) { + // gotcha! a thread is trying to use the same class and that is + // not current thread. + asserter_->warn(previous_thread_id, current_thread_id); + } + + subtle::NoBarrier_AtomicIncrement(&counter_, 1); +} + +void ThreadCollisionWarner::Enter() { + subtle::Atomic64 current_thread_id = CurrentThread(); + + int64_t previous_thread_id = subtle::NoBarrier_CompareAndSwap(&valid_thread_id_, + 0, + current_thread_id); + if (previous_thread_id != 0) { + // gotcha! another thread is trying to use the same class. + asserter_->warn(previous_thread_id, current_thread_id); + } + + subtle::NoBarrier_AtomicIncrement(&counter_, 1); +} + +void ThreadCollisionWarner::Leave() { + if (subtle::Barrier_AtomicIncrement(&counter_, -1) == 0) { + subtle::NoBarrier_Store(&valid_thread_id_, 0); + } +} + +} // namespace base diff --git a/be/src/gutil/threading/thread_collision_warner.h b/be/src/gutil/threading/thread_collision_warner.h new file mode 100644 index 0000000000..5d9a9ac692 --- /dev/null +++ b/be/src/gutil/threading/thread_collision_warner.h @@ -0,0 +1,247 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef BASE_THREADING_THREAD_COLLISION_WARNER_H_ +#define BASE_THREADING_THREAD_COLLISION_WARNER_H_ + +#include + +#include "gutil/atomicops.h" +#include "gutil/macros.h" + +#ifndef BASE_EXPORT +#define BASE_EXPORT +#endif + +// A helper class alongside macros to be used to verify assumptions about thread +// safety of a class. +// +// Example: Queue implementation non thread-safe but still usable if clients +// are synchronized somehow. +// +// In this case the macro DFAKE_SCOPED_LOCK has to be +// used, it checks that if a thread is inside the push/pop then +// noone else is still inside the pop/push +// +// class NonThreadSafeQueue { +// public: +// ... +// void push(int) { DFAKE_SCOPED_LOCK(push_pop_); ... } +// int pop() { DFAKE_SCOPED_LOCK(push_pop_); ... } +// ... +// private: +// DFAKE_MUTEX(push_pop_); +// }; +// +// +// Example: Queue implementation non thread-safe but still usable if clients +// are synchronized somehow, it calls a method to "protect" from +// a "protected" method +// +// In this case the macro DFAKE_SCOPED_RECURSIVE_LOCK +// has to be used, it checks that if a thread is inside the push/pop +// then noone else is still inside the pop/push +// +// class NonThreadSafeQueue { +// public: +// void push(int) { +// DFAKE_SCOPED_LOCK(push_pop_); +// ... +// } +// int pop() { +// DFAKE_SCOPED_RECURSIVE_LOCK(push_pop_); +// bar(); +// ... +// } +// void bar() { DFAKE_SCOPED_RECURSIVE_LOCK(push_pop_); ... } +// ... +// private: +// DFAKE_MUTEX(push_pop_); +// }; +// +// +// Example: Queue implementation not usable even if clients are synchronized, +// so only one thread in the class life cycle can use the two members +// push/pop. +// +// In this case the macro DFAKE_SCOPED_LOCK_THREAD_LOCKED pins the +// specified +// critical section the first time a thread enters push or pop, from +// that time on only that thread is allowed to execute push or pop. +// +// class NonThreadSafeQueue { +// public: +// ... +// void push(int) { DFAKE_SCOPED_LOCK_THREAD_LOCKED(push_pop_); ... } +// int pop() { DFAKE_SCOPED_LOCK_THREAD_LOCKED(push_pop_); ... } +// ... +// private: +// DFAKE_MUTEX(push_pop_); +// }; +// +// +// Example: Class that has to be contructed/destroyed on same thread, it has +// a "shareable" method (with external synchronization) and a not +// shareable method (even with external synchronization). +// +// In this case 3 Critical sections have to be defined +// +// class ExoticClass { +// public: +// ExoticClass() { DFAKE_SCOPED_LOCK_THREAD_LOCKED(ctor_dtor_); ... } +// ~ExoticClass() { DFAKE_SCOPED_LOCK_THREAD_LOCKED(ctor_dtor_); ... } +// +// void Shareable() { DFAKE_SCOPED_LOCK(shareable_section_); ... } +// void NotShareable() { DFAKE_SCOPED_LOCK_THREAD_LOCKED(ctor_dtor_); ... } +// ... +// private: +// DFAKE_MUTEX(ctor_dtor_); +// DFAKE_MUTEX(shareable_section_); +// }; + + +#if !defined(NDEBUG) + +// Defines a class member that acts like a mutex. It is used only as a +// verification tool. +#define DFAKE_MUTEX(obj) \ + mutable base::ThreadCollisionWarner obj +// Asserts the call is never called simultaneously in two threads. Used at +// member function scope. +#define DFAKE_SCOPED_LOCK(obj) \ + base::ThreadCollisionWarner::ScopedCheck s_check_##obj(&obj) +// Asserts the call is never called simultaneously in two threads. Used at +// member function scope. Same as DFAKE_SCOPED_LOCK but allows recursive locks. 
+#define DFAKE_SCOPED_RECURSIVE_LOCK(obj) \ + base::ThreadCollisionWarner::ScopedRecursiveCheck sr_check_##obj(&obj) +// Asserts the code is always executed in the same thread. +#define DFAKE_SCOPED_LOCK_THREAD_LOCKED(obj) \ + base::ThreadCollisionWarner::Check check_##obj(&obj) + +#else + +#define DFAKE_MUTEX(obj) typedef void InternalFakeMutexType##obj +#define DFAKE_SCOPED_LOCK(obj) ((void)0) +#define DFAKE_SCOPED_RECURSIVE_LOCK(obj) ((void)0) +#define DFAKE_SCOPED_LOCK_THREAD_LOCKED(obj) ((void)0) + +#endif + +namespace base { + +// The class ThreadCollisionWarner uses an Asserter to notify the collision +// AsserterBase is the interfaces and DCheckAsserter is the default asserter +// used. During the unit tests is used another class that doesn't "DCHECK" +// in case of collision (check thread_collision_warner_unittests.cc) +struct BASE_EXPORT AsserterBase { + virtual ~AsserterBase() {} + virtual void warn(int64_t previous_thread_id, int64_t current_thread_id) = 0; +}; + +struct BASE_EXPORT DCheckAsserter : public AsserterBase { + virtual ~DCheckAsserter() {} + void warn(int64_t previous_thread_id, int64_t current_thread_id) override; +}; + +class BASE_EXPORT ThreadCollisionWarner { + public: + // The parameter asserter is there only for test purpose + explicit ThreadCollisionWarner(AsserterBase* asserter = new DCheckAsserter()) + : valid_thread_id_(0), + counter_(0), + asserter_(asserter) {} + + ~ThreadCollisionWarner() { + delete asserter_; + } + + // This class is meant to be used through the macro + // DFAKE_SCOPED_LOCK_THREAD_LOCKED + // it doesn't leave the critical section, as opposed to ScopedCheck, + // because the critical section being pinned is allowed to be used only + // from one thread + class BASE_EXPORT Check { + public: + explicit Check(ThreadCollisionWarner* warner) + : warner_(warner) { + warner_->EnterSelf(); + } + + ~Check() {} + + private: + ThreadCollisionWarner* warner_; + + DISALLOW_COPY_AND_ASSIGN(Check); + }; + + // This class is meant to be used through the macro + // DFAKE_SCOPED_LOCK + class BASE_EXPORT ScopedCheck { + public: + explicit ScopedCheck(ThreadCollisionWarner* warner) + : warner_(warner) { + warner_->Enter(); + } + + ~ScopedCheck() { + warner_->Leave(); + } + + private: + ThreadCollisionWarner* warner_; + + DISALLOW_COPY_AND_ASSIGN(ScopedCheck); + }; + + // This class is meant to be used through the macro + // DFAKE_SCOPED_RECURSIVE_LOCK + class BASE_EXPORT ScopedRecursiveCheck { + public: + explicit ScopedRecursiveCheck(ThreadCollisionWarner* warner) + : warner_(warner) { + warner_->EnterSelf(); + } + + ~ScopedRecursiveCheck() { + warner_->Leave(); + } + + private: + ThreadCollisionWarner* warner_; + + DISALLOW_COPY_AND_ASSIGN(ScopedRecursiveCheck); + }; + + private: + // This method stores the current thread identifier and does a DCHECK + // if a another thread has already done it, it is safe if same thread + // calls this multiple time (recursion allowed). + void EnterSelf(); + + // Same as EnterSelf but recursion is not allowed. + void Enter(); + + // Removes the thread_id stored in order to allow other threads to + // call EnterSelf or Enter. + void Leave(); + + // This stores the thread id that is inside the critical section, if the + // value is 0 then no thread is inside. + volatile subtle::Atomic64 valid_thread_id_; + + // Counter to trace how many time a critical section was "pinned" + // (when allowed) in order to unpin it when counter_ reaches 0. 
+ volatile subtle::Atomic64 counter_; + + // Here only for class unit tests purpose, during the test I need to not + // DCHECK but notify the collision with something else. + AsserterBase* asserter_; + + DISALLOW_COPY_AND_ASSIGN(ThreadCollisionWarner); +}; + +} // namespace base + +#endif // BASE_THREADING_THREAD_COLLISION_WARNER_H_ diff --git a/be/src/gutil/walltime.h b/be/src/gutil/walltime.h new file mode 100644 index 0000000000..337535f449 --- /dev/null +++ b/be/src/gutil/walltime.h @@ -0,0 +1,216 @@ +// Copyright 2012 Google Inc. All Rights Reserved. +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef GUTIL_WALLTIME_H_ +#define GUTIL_WALLTIME_H_ + +#include + +#include +#include + +#if defined(__APPLE__) +#include +#include +#include + +#include + +#include "gutil/once.h" +#endif // #if defined(__APPLE__) + +#include "gutil/integral_types.h" + +typedef double WallTime; + +// Append result to a supplied string. +// If an error occurs during conversion 'dst' is not modified. +void StringAppendStrftime(std::string* dst, + const char* format, + time_t when, + bool local); + +// Return the given timestamp (in seconds since the epoch) as a string suitable +// for user display in the current timezone. +std::string TimestampAsString(time_t timestamp_secs); + +// Return the local time as a string suitable for user display. +std::string LocalTimeAsString(); + +// Similar to the WallTime_Parse, but it takes a boolean flag local as +// argument specifying if the time_spec is in local time or UTC +// time. If local is set to true, the same exact result as +// WallTime_Parse is returned. +bool WallTime_Parse_Timezone(const char* time_spec, + const char* format, + const struct tm* default_time, + bool local, + WallTime* result); + +// Return current time in seconds as a WallTime. +WallTime WallTime_Now(); + +typedef int64 MicrosecondsInt64; + +namespace walltime_internal { + +#if defined(__APPLE__) + +extern GoogleOnceType timebase_info_once; +extern mach_timebase_info_data_t timebase_info; +extern void InitializeTimebaseInfo(); + +inline void GetCurrentTime(mach_timespec_t* ts) { + clock_serv_t cclock; + host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); + CHECK_EQ(KERN_SUCCESS, clock_get_time(cclock, ts)); + mach_port_deallocate(mach_task_self(), cclock); +} + +inline MicrosecondsInt64 GetCurrentTimeMicros() { + mach_timespec_t ts; + GetCurrentTime(&ts); + // 'tv_sec' is just 4 bytes on macOS, need to be careful not + // to convert to nanos until we've moved to a larger int. 
+ MicrosecondsInt64 micros_from_secs = ts.tv_sec; + micros_from_secs *= 1000 * 1000; + micros_from_secs += ts.tv_nsec / 1000; + return micros_from_secs; +} + +inline int64_t GetMonoTimeNanos() { + // See Apple Technical Q&A QA1398 for further detail on mono time in OS X. + GoogleOnceInit(&timebase_info_once, &InitializeTimebaseInfo); + + uint64_t time = mach_absolute_time(); + + // mach_absolute_time returns ticks, which need to be scaled by the timebase + // info to get nanoseconds. + return time * timebase_info.numer / timebase_info.denom; +} + +inline MicrosecondsInt64 GetMonoTimeMicros() { + return GetMonoTimeNanos() / 1000; +} + +inline MicrosecondsInt64 GetThreadCpuTimeMicros() { + // See https://www.gnu.org/software/hurd/gnumach-doc/Thread-Information.html + // and Chromium base/time/time_mac.cc. + task_t thread = mach_thread_self(); + if (thread == MACH_PORT_NULL) { + LOG(WARNING) << "Failed to get mach_thread_self()"; + return 0; + } + + mach_msg_type_number_t thread_info_count = THREAD_BASIC_INFO_COUNT; + thread_basic_info_data_t thread_info_data; + + kern_return_t result = thread_info( + thread, + THREAD_BASIC_INFO, + reinterpret_cast(&thread_info_data), + &thread_info_count); + + if (result != KERN_SUCCESS) { + LOG(WARNING) << "Failed to get thread_info()"; + return 0; + } + + return thread_info_data.user_time.seconds * 1000000 + thread_info_data.user_time.microseconds; +} + +#else + +inline MicrosecondsInt64 GetClockTimeMicros(clockid_t clock) { + timespec ts; + clock_gettime(clock, &ts); + // 'tv_sec' is usually 8 bytes, but the spec says it only + // needs to be 'a signed int'. Moved to a 64 bit var before + // converting to micros to be safe. + MicrosecondsInt64 micros_from_secs = ts.tv_sec; + micros_from_secs *= 1000 * 1000; + micros_from_secs += ts.tv_nsec / 1000; + return micros_from_secs; +} + +#endif // defined(__APPLE__) + +} // namespace walltime_internal + +// Returns the time since the Epoch measured in microseconds. +inline MicrosecondsInt64 GetCurrentTimeMicros() { +#if defined(__APPLE__) + return walltime_internal::GetCurrentTimeMicros(); +#else + return walltime_internal::GetClockTimeMicros(CLOCK_REALTIME); +#endif // defined(__APPLE__) +} + +// Returns the time since some arbitrary reference point, measured in microseconds. +// Guaranteed to be monotonic (and therefore useful for measuring intervals), +// but the underlying clock is subject for adjustment by adjtime() and +// the kernel's NTP discipline. For example, the underlying clock might +// be slewed a bit to reach some reference point, time to time adjusted to be +// of the desired result frequency, etc. +inline MicrosecondsInt64 GetMonoTimeMicros() { +#if defined(__APPLE__) + // In fact, walltime_internal::GetMonoTimeMicros() is implemented via + // mach_absolute_time() which is not actually affected by adjtime() + // or the NTP discipline. On Darwin 16.0 and newer (macOS 10.12 and newer), + // it's the same as clock_gettime(CLOCK_UPTIME_RAW); see 'man clock_gettime' + // on macOS 10.12 and newer. + return walltime_internal::GetMonoTimeMicros(); +#else + return walltime_internal::GetClockTimeMicros(CLOCK_MONOTONIC); +#endif // defined(__APPLE__) +} + +// Returns the time since some arbitrary reference point, measured in microseconds. +// Guaranteed to be monotonic and not affected at all by frequency and time +// adjustments such as adjtime() or the kernel's NTP discipline. 
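+// Either monotonic variant is the right tool for timing an interval
+// (an editorial sketch; DoWork() is a placeholder):
+//   MicrosecondsInt64 start = GetMonoTimeMicros();
+//   DoWork();
+//   MicrosecondsInt64 elapsed_us = GetMonoTimeMicros() - start;
+// Prefer the *Raw flavor below when even small NTP-driven rate adjustments
+// over the measured interval would matter.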
+inline MicrosecondsInt64 GetMonoTimeMicrosRaw() { +#if defined(__APPLE__) + return walltime_internal::GetMonoTimeMicros(); +#else + return walltime_internal::GetClockTimeMicros(CLOCK_MONOTONIC_RAW); +#endif // defined(__APPLE__) +} + +// Returns the time spent in user CPU on the current thread, measured in microseconds. +inline MicrosecondsInt64 GetThreadCpuTimeMicros() { +#if defined(__APPLE__) + return walltime_internal::GetThreadCpuTimeMicros(); +#else + return walltime_internal::GetClockTimeMicros(CLOCK_THREAD_CPUTIME_ID); +#endif // defined(__APPLE__) +} + +// A CycleClock yields the value of a cycle counter that increments at a rate +// that is approximately constant. +class CycleClock { + public: + // Return the value of the counter. + static inline int64 Now(); + + private: + CycleClock(); +}; + +// inline method bodies +#include "gutil/cycleclock-inl.h" // IWYU pragma: export +#endif // GUTIL_WALLTIME_H_
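As a closing illustration, here is a minimal sketch of how the reference-counting pieces introduced by this patch are meant to be used together. The 'Session' class and 'Demo()' function are hypothetical names for this example only; the pattern (private destructor plus a friend declaration for the base) follows the guidance in the ref_counted.h comments above.

    #include "gutil/ref_counted.h"

    class Session : public doris::RefCountedThreadSafe<Session> {
     public:
      void Touch() { /* ... */ }

     private:
      // Only Release() (via the default traits) may delete a Session.
      friend class doris::RefCountedThreadSafe<Session>;
      ~Session() {}
    };

    void Demo() {
      scoped_refptr<Session> a(new Session());  // ref count becomes 1
      scoped_refptr<Session> b = a;             // ref count becomes 2
      b->Touch();
      a.reset();                                // drops a's reference, count back to 1
    }                                           // 'b' destroyed here, count hits 0, Session deleted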