diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt index 2638608579..08d2f81b7d 100644 --- a/be/CMakeLists.txt +++ b/be/CMakeLists.txt @@ -14,7 +14,28 @@ # under the License. cmake_minimum_required(VERSION 2.8.10) -project(palo) + +# set CMAKE_C_COMPILER, this must set before project command +if (DEFINED ENV{PALO_GCC_HOME}) + set(CMAKE_C_COMPILER "$ENV{PALO_GCC_HOME}/bin/gcc") + set(CMAKE_CXX_COMPILER "$ENV{PALO_GCC_HOME}/bin/g++") + set(GCC_HOME $ENV{PALO_GCC_HOME}) +else() + set(GCC_HOME "/usr") +endif() + +project(palo CXX C) + +# set CMAKE_BUILD_TYPE +if (DEFINED ENV{BUILD_TYPE}) + set(CMAKE_BUILD_TYPE $ENV{BUILD_TYPE}) +endif() +if (NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE RELEASE) +endif() + +string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE) +message(STATUS "Build type is ${CMAKE_BUILD_TYPE}") # Set dirs set(BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") @@ -26,30 +47,24 @@ set(SRC_DIR "${BASE_DIR}/src/") set(TEST_DIR "${CMAKE_SOURCE_DIR}/test/") set(OUTPUT_DIR "${BASE_DIR}/output") +# LLVM +if (DEFINED ENV{PALO_LLVM_HOME}) + set(LLVM_HOME "$ENV{PALO_LLVM_HOME}") +else() + set(LLVM_HOME "${THIRDPARTY_DIR}") +endif() +set(LLVM_BIN "${LLVM_HOME}/bin") + option(MAKE_TEST "ON for make unit test or OFF for not" OFF) -# Set compiler -set(CMAKE_CXX_COMPILER $ENV{CXX}) -set(CMAKE_C_COMPILER $ENV{CC}) - # Check gcc -if (CMAKE_COMPILER_IS_GNUCC) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion - OUTPUT_VARIABLE GCC_VERSION) - string(REGEX MATCHALL "[0-9]+" GCC_VERSION_COMPONENTS ${GCC_VERSION}) - list(GET GCC_VERSION_COMPONENTS 0 GCC_MAJOR) - list(GET GCC_VERSION_COMPONENTS 1 GCC_MINOR) - - message(STATUS "GCC version: ${GCC_VERSION}") - message(STATUS "GCC major version: ${GCC_MAJOR}") - message(STATUS "GCC minor version: ${GCC_MINOR}") - - if(GCC_VERSION VERSION_LESS "4.8.2") +if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.8.2") message(FATAL_ERROR "Need GCC version at least 4.8.2") - endif(GCC_VERSION 
VERSION_LESS "4.8.2") + endif() else() message(FATAL_ERROR "Compiler should be GNU") -endif(CMAKE_COMPILER_IS_GNUCC) +endif() set(PIC_LIB_PATH "${THIRDPARTY_DIR}") if(PIC_LIB_PATH) @@ -69,6 +84,7 @@ else() set(LIBEVENT event) endif() + # Compile generated source if necessary message(STATUS "build gensrc if necessary") execute_process(COMMAND make -C ${BASE_DIR}/../gensrc/ @@ -82,8 +98,11 @@ set(Boost_DEBUG FALSE) set(Boost_USE_MULTITHREADED ON) set(BOOST_ROOT ${THIRDPARTY_DIR}) -find_package(Boost 1.55.0 REQUIRED COMPONENTS thread regex system filesystem date_time program_options) +find_package(Boost 1.55.0 REQUIRED COMPONENTS thread regex filesystem system date_time program_options) include_directories(${Boost_INCLUDE_DIRS}) +message(STATUS ${Boost_LIBRARIES}) + +set(GPERFTOOLS_HOME "${THIRDPARTY_DIR}") # Set all libraries add_library(gflags STATIC IMPORTED) @@ -96,10 +115,12 @@ add_library(re2 STATIC IMPORTED) set_target_properties(re2 PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libre2.a) add_library(pprof STATIC IMPORTED) -set_target_properties(pprof PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libprofiler.a) +set_target_properties(pprof PROPERTIES IMPORTED_LOCATION + ${GPERFTOOLS_HOME}/lib/libprofiler.a) add_library(tcmalloc STATIC IMPORTED) -set_target_properties(tcmalloc PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libtcmalloc.a) +set_target_properties(tcmalloc PROPERTIES IMPORTED_LOCATION + ${GPERFTOOLS_HOME}/lib/libtcmalloc.a) add_library(protobuf STATIC IMPORTED) set_target_properties(protobuf PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libprotobuf.a) @@ -138,7 +159,7 @@ add_library(libevent STATIC IMPORTED) set_target_properties(libevent PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libevent.a) add_library(LLVMSupport STATIC IMPORTED) -set_target_properties(LLVMSupport PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libLLVMSupport.a) +set_target_properties(LLVMSupport PROPERTIES IMPORTED_LOCATION 
${LLVM_HOME}/lib/libLLVMSupport.a) add_library(crypto STATIC IMPORTED) set_target_properties(crypto PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libcrypto.a) @@ -146,11 +167,13 @@ set_target_properties(crypto PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/ add_library(openssl STATIC IMPORTED) set_target_properties(openssl PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libssl.a) -find_program(THRIFT_COMPILER thrift ${CMAKE_SOURCE_DIR}/bin) +add_library(leveldb STATIC IMPORTED) +set_target_properties(leveldb PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libleveldb.a) -# LLVM -set(LLVM_BIN "${THIRDPARTY_DIR}/bin") -message(STATUS ${LLVM_HOME}) +add_library(brpc STATIC IMPORTED) +set_target_properties(brpc PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib64/libbrpc.a) + +find_program(THRIFT_COMPILER thrift ${CMAKE_SOURCE_DIR}/bin) # llvm-config find_program(LLVM_CONFIG_EXECUTABLE llvm-config @@ -211,10 +234,97 @@ execute_process( # Get the link libs we need. llvm has many and we don't want to link all of the libs # if we don't need them. execute_process( - COMMAND ${LLVM_CONFIG_EXECUTABLE} --libnames core jit native ipo bitreader target + COMMAND ${LLVM_CONFIG_EXECUTABLE} --libnames core mcjit native ipo bitreader target OUTPUT_VARIABLE LLVM_MODULE_LIBS OUTPUT_STRIP_TRAILING_WHITESPACE ) +# compiler flags that are common across debug/release builds +# -Wall: Enable all warnings. +# -Wno-sign-compare: suppress warnings for comparison between signed and unsigned +# integers +# -fno-strict-aliasing: disable optimizations that assume strict aliasing. This +# is unsafe to do if the code uses casts (which we obviously do). 
+# -Wno-unknown-pragmas: suppress warnings for unknown (compiler specific) pragmas +# -Wno-deprecated: gutil contains deprecated headers +# -Wno-vla: we use C99-style variable-length arrays +# -pthread: enable multithreaded malloc +# -DBOOST_DATE_TIME_POSIX_TIME_STD_CONFIG: enable nanosecond precision for boost +# -fno-omit-frame-pointers: Keep frame pointer for functions in register +set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wall -Wno-sign-compare -Wno-unknown-pragmas -pthread") +set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -fno-strict-aliasing -fno-omit-frame-pointer") +set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -std=gnu++11") +set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-deprecated -Wno-vla") +set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -DBOOST_DATE_TIME_POSIX_TIME_STD_CONFIG") +set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -DBOOST_SYSTEM_NO_DEPRECATED") +set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -msse4.2 -D_GLIBCXX_USE_CXX11_ABI=0") +set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -DLLVM_ON_UNIX") + +# for bprc +if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -faligned-new") +endif() + +# For any gcc builds: +# -g: Enable symbols for profiler tools +# -Wno-unused-local-typedefs: Do not warn for local typedefs that are unused. +set(CXX_GCC_FLAGS "-g -Wno-unused-local-typedefs") + +# For CMAKE_BUILD_TYPE=Debug +# -ggdb: Enable gdb debugging +# Debug information is stored as dwarf2 to be as compatible as possible +# -Werror: compile warnings should be errors when using the toolchain compiler. +# Only enable for debug builds because this is what we test in pre-commit tests. +set(CXX_FLAGS_DEBUG "${CXX_GCC_FLAGS} -Werror -ggdb") + +# For CMAKE_BUILD_TYPE=Release +# -O3: Enable all compiler optimizations +# -DNDEBUG: Turn off dchecks/asserts/debug only code. 
+# -gdwarf-2: Debug information is stored as dwarf2 to be as compatible as possible +set(CXX_FLAGS_RELEASE "${CXX_GCC_FLAGS} -O3 -gdwarf-2") + +SET(CXX_FLAGS_ASAN "${CXX_GCC_FLAGS} -O1 -fsanitize=address -DADDRESS_SANITIZER") +SET(CXX_FLAGS_LSAN "${CXX_GCC_FLAGS} -O1 -fsanitize=leak -DLEAK_SANITIZER") + +# Set the flags to the undefined behavior sanitizer, also known as "ubsan" +# Turn on sanitizer and debug symbols to get stack traces: +SET(CXX_FLAGS_UBSAN "${CXX_GCC_FLAGS} -ggdb3 -fsanitize=undefined") +# Ignore a number of noisy errors with too many false positives: +# TODO(zc): +# SET(CXX_FLAGS_UBSAN "${CXX_FLAGS_UBSAN} -fno-sanitize=alignment,function,vptr,float-divide-by-zero,float-cast-overflow") +# Don't enforce wrapped signed integer arithmetic so that the sanitizer actually sees +# undefined wrapping: +SET(CXX_FLAGS_UBSAN "${CXX_FLAGS_UBSAN} -fno-wrapv") +# To ease debugging, turn off all optimizations: +SET(CXX_FLAGS_UBSAN "${CXX_FLAGS_UBSAN} -O0") + +# Set the flags to the thread sanitizer, also known as "tsan" +# Turn on sanitizer and debug symbols to get stack traces: +SET(CXX_FLAGS_TSAN "${CXX_GCC_FLAGS} -O1 -ggdb3 -fsanitize=thread -DTHREAD_SANITIZER") + +# Set compile flags based on the build type. 
+if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") + SET(CMAKE_CXX_FLAGS ${CXX_FLAGS_DEBUG}) +elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE") + SET(CMAKE_CXX_FLAGS ${CXX_FLAGS_RELEASE}) +elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "ASAN") + SET(CMAKE_CXX_FLAGS "${CXX_FLAGS_ASAN}") +elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "LSAN") + SET(CMAKE_CXX_FLAGS "${CXX_FLAGS_LSAN}") +elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "UBSAN") + SET(CMAKE_CXX_FLAGS "${CXX_FLAGS_UBSAN}") +elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "TSAN") + SET(CMAKE_CXX_FLAGS "${CXX_FLAGS_TSAN}") +else() + message(FATAL_ERROR "Unknown build type: ${CMAKE_BUILD_TYPE}") +endif() + +# Add flags that are common across build types +SET(CMAKE_CXX_FLAGS "${CXX_COMMON_FLAGS} ${CMAKE_CXX_FLAGS}") + +message(STATUS "Compiler Flags: ${CMAKE_CXX_FLAGS}") + +# Thrift requires these two definitions for some types that we use +add_definitions(-DHAVE_INTTYPES_H -DHAVE_NETINET_IN_H) # TODO: this does not work well. the config file will output -I/ and # also -DNDEBUG. I've hard coded the #define that are necessary but we should make @@ -227,7 +337,7 @@ execute_process( #) set(LLVM_CFLAGS "-D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS") -if(GCC_VERSION VERSION_LESS "5.0.0") +if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5.0.0") message(STATUS "GCC version is less than 5.0.0, no need to set -D__GLIBCXX_BITSIZE_INT_N_0=128 and -D__GLIBCXX_TYPE_INT_N_0=__int128") else() SET(LLVM_CFLAGS "${LLVM_LFLAGS} -D__GLIBCXX_BITSIZE_INT_N_0=128 -D__GLIBCXX_TYPE_INT_N_0=__int128") @@ -238,19 +348,21 @@ endif() # Note that we don't enable any optimization. We want unoptimized IR since we will be # modifying it at runtime, then re-compiling (and optimizing) the modified code. The final # optimizations will be less effective if the initial code is also optimized. 
+set(CLANG_IR_CXX_FLAGS "-gcc-toolchain" ${GCC_HOME}) +set(CLANG_IR_CXX_FLAGS ${CLANG_IR_CXX_FLAGS} "-std=gnu++11" "-c" "-emit-llvm" "-D__STDC_CONSTANT_MACROS" "-D__STDC_FORMAT_MACROS" "-D__STDC_LIMIT_MACROS" "-DIR_COMPILE" "-DNDEBUG" "-DHAVE_INTTYPES_H" "-DHAVE_NETINET_IN_H" "-DBOOST_DATE_TIME_POSIX_TIME_STD_CONFIG" "-D__GLIBCXX_BITSIZE_INT_N_0=128" "-D__GLIBCXX_TYPE_INT_N_0=__int128" "-U_GLIBCXX_USE_FLOAT128" "-DLLVM_ON_UNIX") -if(DEFINED ENV{GCC_TOOLCHAIN}) - set(CLANG_IR_CXX_FLAGS $ENV{GCC_TOOLCHAIN}) +if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + # for support float128 + set(CLANG_IR_CXX_FLAGS ${CLANG_IR_CXX_FLAGS} "-D__STRICT_ANSI__") endif() -set(CLANG_IR_CXX_FLAGS ${CLANG_IR_CXX_FLAGS} "-std=gnu++11" "-c" "-emit-llvm" "-D__STDC_CONSTANT_MACROS" "-D__STDC_FORMAT_MACROS" "-D__STDC_LIMIT_MACROS" "-DIR_COMPILE" "-DNDEBUG" "-DHAVE_INTTYPES_H" "-DHAVE_NETINET_IN_H" "-DBOOST_DATE_TIME_POSIX_TIME_STD_CONFIG" "-D__GLIBCXX_BITSIZE_INT_N_0=128" "-D__GLIBCXX_TYPE_INT_N_0=__int128" "-U_GLIBCXX_USE_FLOAT128") message(STATUS "CLANG_IR_CXX_FLAGS: ${CLANG_IR_CXX_FLAGS}") # CMake really doesn't like adding link directories and wants absolute paths # Reconstruct it with LLVM_MODULE_LIBS and LLVM_LIBRARY_DIR -string(REPLACE " " ";" LIBS_LIST ${LLVM_MODULE_LIBS}) -set (LLVM_MODULE_LIBS "-ldl") -foreach (LIB ${LIBS_LIST}) +string(REPLACE " " ";" LLVM_LIBS_LIST ${LLVM_MODULE_LIBS}) +set (LLVM_MODULE_LIBS "") +foreach (LIB ${LLVM_LIBS_LIST}) set(LLVM_MODULE_LIBS ${LLVM_MODULE_LIBS} "${LLVM_LIBRARY_DIR}/${LIB}") endforeach(LIB) @@ -272,33 +384,35 @@ execute_process(COMMAND lsb_release -si OUTPUT_VARIABLE LINUX_VERSION) string(TOLOWER ${LINUX_VERSION} LINUX_VERSION_LOWER) message(STATUS "${LINUX_VERSION_LOWER}") -if(DEFINED ENV{CLANG_BASE_FLAGS}) - set(CLANG_BASE_FLAGS - $ENV{CLANG_BASE_FLAGS}) -elseif(${LINUX_VERSION_LOWER} MATCHES "ubuntu") - set(CLANG_BASE_FLAGS - "-I/usr/include/c++/5/" - "-I/usr/include/x86_64-linux-gnu/c++/5/") -elseif(${LINUX_VERSION_LOWER} MATCHES 
"centos") - set(CLANG_BASE_FLAGS - "-I/usr/include/c++/4.8.5/" - "-I/usr/include/c++/4.8.5/x86_64-redhat-linux/") -elseif(${LINUX_VERSION_LOWER} MATCHES "fedora") - set(CLANG_BASE_FLAGS - "-I/usr/include/c++/7/" - "-I/usr/include/c++/7/x86_64-redhat-linux/") -else() - message(FATAL_ERROR "Currently not support system ${LINUX_VERSION}") -endif() +# if(DEFINED ENV{CLANG_BASE_FLAGS}) +# set(CLANG_BASE_FLAGS +# $ENV{CLANG_BASE_FLAGS}) +# elseif(${LINUX_VERSION_LOWER} MATCHES "ubuntu") +# set(CLANG_BASE_FLAGS +# "-I/usr/include/c++/5/" +# "-I/usr/include/x86_64-linux-gnu/c++/5/") +# elseif(${LINUX_VERSION_LOWER} MATCHES "centos") +# set(CLANG_BASE_FLAGS +# "-I/usr/include/c++/4.8.5/" +# "-I/usr/include/c++/4.8.5/x86_64-redhat-linux/") +# elseif(${LINUX_VERSION_LOWER} MATCHES "fedora") +# set(CLANG_BASE_FLAGS +# "-I/usr/include/c++/7/" +# "-I/usr/include/c++/7/x86_64-redhat-linux/") +# else() +# message(FATAL_ERROR "Currently not support system ${LINUX_VERSION}") +# endif() message(STATUS "CLANG_BASE_FLAGS: ${CLANG_BASE_FLAGS}") set(CLANG_INCLUDE_FLAGS + "-I${LLVM_HOME}/include" "-I${BASE_DIR}/src" "-I${GENSRC_DIR}" "-I${THIRDPARTY_DIR}/include" "-I${THIRDPARTY_DIR}/include/thrift/" "-I${THIRDPARTY_DIR}/include/event/" + "-I${GPERFTOOLS_HOME}/include" ${CLANG_BASE_FLAGS} ) @@ -308,17 +422,18 @@ include_directories( ${SRC_DIR}/ ${TEST_DIR}/ ${GENSRC_DIR}/ - ${THIRDPARTY_DIR}/include/ + ${THIRDPARTY_DIR}/include + ${GPERFTOOLS_HOME}/include ${THIRDPARTY_DIR}/include/thrift/ ${THIRDPARTY_DIR}/include/event/ ) -# Set libraries + set(WL_START_GROUP "-Wl,--start-group") set(WL_END_GROUP "-Wl,--end-group") # Set Palo libraries -set (PALO_LINK_LIBS +set(PALO_LINK_LIBS ${WL_START_GROUP} Agent CodeGen @@ -340,35 +455,57 @@ set (PALO_LINK_LIBS ) # Set thirdparty libraries -set (PALO_LINK_LIBS ${PALO_LINK_LIBS} - protobuf +set(PALO_DEPENDENCIES lzo snappy ${Boost_LIBRARIES} ${LLVM_MODULE_LIBS} - # popt thrift thriftnb - ${WL_START_GROUP} glog - gflags re2 pprof - tcmalloc lz4 
libevent - ${LIBZ} - ${LIBBZ2} mysql curl - ${WL_END_GROUP} - -lrt - -lbfd - -liberty + ${WL_START_GROUP} + ${LIBZ} + ${LIBBZ2} + gflags + brpc + protobuf openssl crypto - #-fsanitize=address - #-lboost_date_time + ${WL_START_GROUP} + leveldb +) + +# Add all external dependencies. They should come after the palo libs. +# static link gcc's lib +set(PALO_LINK_LIBS ${PALO_LINK_LIBS} + ${PALO_DEPENDENCIES} + -static-libstdc++ + -static-libgcc +) + +# Add sanitize static link flags or tcmalloc +if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG" OR "${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE") + set(PALO_LINK_LIBS ${PALO_LINK_LIBS} tcmalloc) +elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "ASAN") + set(PALO_LINK_LIBS ${PALO_LINK_LIBS} -static-libasan) +elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "LSAN") + set(PALO_LINK_LIBS ${PALO_LINK_LIBS} -static-liblsan) +elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "UBSAN") + set(PALO_LINK_LIBS ${PALO_LINK_LIBS} -static-libubsan tcmalloc) +elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "TSAN") + set(PALO_LINK_LIBS ${PALO_LINK_LIBS} -static-libtsan) +else() + message(FATAL_ERROR "Unknown build type: ${CMAKE_BUILD_TYPE}") +endif() + +set(PALO_LINK_LIBS ${PALO_LINK_LIBS} + -lrt -lbfd -liberty -lc -lm -ldl -pthread -lz ) # Set libraries for test @@ -380,22 +517,6 @@ set (TEST_LINK_LIBS ${PALO_LINK_LIBS} ${WL_END_GROUP} ) -# Set CXX flags -SET(CXX_COMMON_FLAGS "-msse4.2 -Wall -Wno-sign-compare -Wno-deprecated -pthread -fno-omit-frame-pointer") -SET(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -DBOOST_DATE_TIME_POSIX_TIME_STD_CONFIG -D__STDC_FORMAT_MACROS") - -# Add by zhaochun: use gnu++11 for make_unsigned<__int128> -SET(CMAKE_CXX_FLAGS "-g -ggdb -O2 -Wno-unused-local-typedefs -Wno-strict-aliasing -std=gnu++11 -D_FILE_OFFSET_BITS=64") - -# use address sanitizer, commented the malloc in ld flags -# SET(CMAKE_CXX_FLAGS "-g -ggdb -Wno-unused-local-typedefs -Wno-strict-aliasing -std=gnu++11 -fsanitize=address -fno-omit-frame-pointer -DADDRESS_SANITIZER") -SET(CMAKE_CXX_FLAGS 
"${CXX_COMMON_FLAGS} ${CMAKE_CXX_FLAGS}") - -MESSAGE(STATUS "Compiler Flags: ${CMAKE_CXX_FLAGS}") - -# Thrift requires these two definitions for some types that we use -add_definitions(-DHAVE_INTTYPES_H -DHAVE_NETINET_IN_H) - # Only build static libs set(BUILD_SHARED_LIBS OFF) @@ -432,7 +553,7 @@ FUNCTION(ADD_BE_TEST TEST_NAME) ADD_EXECUTABLE(${TEST_FILE_NAME} ${TEST_NAME}.cpp) TARGET_LINK_LIBRARIES(${TEST_FILE_NAME} ${TEST_LINK_LIBS}) - SET_TARGET_PROPERTIES(${TEST_FILE_NAME} PROPERTIES COMPILE_FLAGS "-Dprivate=public -Dprotected=public") + SET_TARGET_PROPERTIES(${TEST_FILE_NAME} PROPERTIES COMPILE_FLAGS "-fno-access-control") if (NOT "${TEST_DIR_NAME}" STREQUAL "") SET_TARGET_PROPERTIES(${TEST_FILE_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}/${DIR_NAME}/${TEST_DIR_NAME}") endif() @@ -448,6 +569,7 @@ if (${MAKE_TEST} STREQUAL "ON") add_subdirectory(${TEST_DIR}/exec) add_subdirectory(${TEST_DIR}/exprs) add_subdirectory(${TEST_DIR}/runtime) + add_subdirectory(${TEST_DIR}/http) endif () # Install be diff --git a/be/src/agent/heartbeat_server.cpp b/be/src/agent/heartbeat_server.cpp index 1823b13c45..3ccab0bc8b 100644 --- a/be/src/agent/heartbeat_server.cpp +++ b/be/src/agent/heartbeat_server.cpp @@ -119,6 +119,7 @@ void HeartbeatServer::heartbeat( backend_info.__set_be_port(config::be_port); backend_info.__set_http_port(config::webserver_port); backend_info.__set_be_rpc_port(config::be_rpc_port); + backend_info.__set_brpc_port(config::brpc_port); } else { status_code = TStatusCode::RUNTIME_ERROR; } diff --git a/be/src/agent/task_worker_pool.cpp b/be/src/agent/task_worker_pool.cpp index f116180021..62fe5446e7 100644 --- a/be/src/agent/task_worker_pool.cpp +++ b/be/src/agent/task_worker_pool.cpp @@ -153,7 +153,7 @@ void TaskWorkerPool::start() { _callback_function = _report_disk_state_worker_thread_callback; break; case TaskWorkerType::REPORT_OLAP_TABLE: - _wait_duration = boost::posix_time::time_duration(0, 0, 
config::report_olap_table_interval_seconds, 0); + _wait_duration = boost::posix_time::time_duration(0, 0, config::report_disk_state_interval_seconds, 0); _worker_count = REPORT_OLAP_TABLE_WORKER_COUNT; _callback_function = _report_olap_table_worker_thread_callback; break; @@ -268,7 +268,9 @@ void TaskWorkerPool::_spawn_callback_worker_thread(CALLBACK_FUNCTION callback_fu err = pthread_create(&thread, NULL, callback_func, this); if (err != 0) { OLAP_LOG_WARNING("failed to spawn a thread. error: %d", err); +#ifndef BE_TEST sleep(config::sleep_one_second); +#endif } else { pthread_detach(thread); break; @@ -742,7 +744,7 @@ void* TaskWorkerPool::_push_worker_thread_callback(void* arg_this) { OLAPStatus delete_data_status = worker_pool_this->_command_executor->delete_data(push_req, &tablet_infos); if (delete_data_status != OLAPStatus::OLAP_SUCCESS) { - OLAP_LOG_WARNING("delet data failed. statusta: %d, signature: %ld", + OLAP_LOG_WARNING("delete data failed. status: %d, signature: %ld", delete_data_status, agent_task_req.signature); status = PALO_ERROR; } @@ -1114,7 +1116,9 @@ AgentStatus TaskWorkerPool::_clone_copy( downloader_param.remote_file_path.c_str(), signature); ++download_retry_time; +#ifndef BE_TEST sleep(download_retry_time); +#endif } else { break; } @@ -1200,7 +1204,9 @@ AgentStatus TaskWorkerPool::_clone_copy( downloader_param.remote_file_path.c_str(), signature); ++download_retry_time; +#ifndef BE_TEST sleep(download_retry_time); +#endif } else { break; } @@ -1268,7 +1274,9 @@ AgentStatus TaskWorkerPool::_clone_copy( } } ++download_retry_time; +#ifndef BE_TEST sleep(download_retry_time); +#endif } // Try to download a file from remote backend #ifndef BE_TEST @@ -1524,7 +1532,6 @@ void* TaskWorkerPool::_report_disk_state_worker_thread_callback(void* arg_this) #ifndef BE_TEST while (true) { -#endif if (worker_pool_this->_master_info.network_address.port == 0) { // port == 0 means not received heartbeat yet // sleep a short time and try again @@ -1532,6 
+1539,7 @@ void* TaskWorkerPool::_report_disk_state_worker_thread_callback(void* arg_this) sleep(config::sleep_one_second); continue; } +#endif vector root_paths_stat; @@ -1585,7 +1593,6 @@ void* TaskWorkerPool::_report_olap_table_worker_thread_callback(void* arg_this) #ifndef BE_TEST while (true) { -#endif if (worker_pool_this->_master_info.network_address.port == 0) { // port == 0 means not received heartbeat yet // sleep a short time and try again @@ -1593,6 +1600,7 @@ void* TaskWorkerPool::_report_olap_table_worker_thread_callback(void* arg_this) sleep(config::sleep_one_second); continue; } +#endif request.tablets.clear(); diff --git a/be/src/codegen/CMakeLists.txt b/be/src/codegen/CMakeLists.txt index dfa1e444bb..4b899c5367 100644 --- a/be/src/codegen/CMakeLists.txt +++ b/be/src/codegen/CMakeLists.txt @@ -70,7 +70,7 @@ add_custom_command( COMMAND ${LLVM_CLANG_EXECUTABLE} ${CLANG_IR_CXX_FLAGS} "-msse4.2" ${CLANG_INCLUDE_FLAGS} ${IR_INPUT_FILES} -o ${IR_SSE_TMP_OUTPUT_FILE} COMMAND ${LLVM_OPT_EXECUTABLE} --instnamer < ${IR_SSE_TMP_OUTPUT_FILE} > ${IR_SSE_OUTPUT_FILE} COMMAND rm ${IR_SSE_TMP_OUTPUT_FILE} - DEPENDS Util Exec Exprs Udf ${IR_INPUT_FILES} + DEPENDS Exec Exprs Udf ${IR_INPUT_FILES} ) # Compile without sse enabled. 
@@ -79,7 +79,7 @@ add_custom_command( COMMAND ${LLVM_CLANG_EXECUTABLE} ${CLANG_IR_CXX_FLAGS} ${CLANG_INCLUDE_FLAGS} ${IR_INPUT_FILES} -o ${IR_NO_SSE_TMP_OUTPUT_FILE} COMMAND ${LLVM_OPT_EXECUTABLE} --instnamer < ${IR_NO_SSE_TMP_OUTPUT_FILE} > ${IR_NO_SSE_OUTPUT_FILE} COMMAND rm ${IR_NO_SSE_TMP_OUTPUT_FILE} - DEPENDS Util Exec Exprs Udf ${IR_INPUT_FILES} + DEPENDS Exec Exprs Udf ${IR_INPUT_FILES} ) add_custom_target(compile_to_ir_sse DEPENDS ${IR_SSE_OUTPUT_FILE}) diff --git a/be/src/codegen/llvm_codegen.h b/be/src/codegen/llvm_codegen.h index d35a867569..a25283652a 100644 --- a/be/src/codegen/llvm_codegen.h +++ b/be/src/codegen/llvm_codegen.h @@ -49,11 +49,11 @@ class BasicBlock; class ConstantFolder; class ExecutionEngine; class Function; -class FunctionPassManager; +// class FunctionPassManager; class LLVMContext; class Module; class NoFolder; -class PassManager; +// class PassManager; class PointerType; class StructType; class TargetData; diff --git a/be/src/common/compiler_util.h b/be/src/common/compiler_util.h index c51a94ef75..9a113956d7 100644 --- a/be/src/common/compiler_util.h +++ b/be/src/common/compiler_util.h @@ -26,6 +26,8 @@ // about memory" paper. // example: if (LIKELY(size > 0)) { ... } // example: if (UNLIKELY(!status.ok())) { ... } +#define CACHE_LINE_SIZE 64 + #ifdef LIKELY #undef LIKELY #endif @@ -45,5 +47,7 @@ /// decision, e.g. not inlining a small function on a hot path. #define ALWAYS_INLINE __attribute__((always_inline)) +#define ALIGN_CACHE_LINE __attribute__ ((aligned (CACHE_LINE_SIZE))) + #endif diff --git a/be/src/common/config.h b/be/src/common/config.h index 50c02213a6..f382c18aa1 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -31,6 +31,9 @@ namespace config { CONF_Int32(be_port, "9060"); CONF_Int32(be_rpc_port, "10060"); + // port for brpc + CONF_Int32(brpc_port, "8060"); + // Declare a selection strategy for those servers have many ips. // Note that there should at most one ip match this list. 
// this is a list in semicolon-delimited format, in CIDR notation, e.g. 10.10.10.0/24 @@ -187,7 +190,6 @@ namespace config { CONF_Int32(sorter_block_size, "8388608"); // push_write_mbytes_per_sec CONF_Int32(push_write_mbytes_per_sec, "10"); - CONF_Int32(base_expansion_write_mbytes_per_sec, "5"); CONF_Int64(column_dictionary_key_ration_threshold, "0"); CONF_Int64(column_dictionary_key_size_threshold, "0"); @@ -201,8 +203,6 @@ namespace config { CONF_Int64(max_unpacked_row_block_size, "104857600"); CONF_Int32(file_descriptor_cache_clean_interval, "3600"); - CONF_Int32(base_expansion_trigger_interval, "1"); - CONF_Int32(cumulative_check_interval, "1"); CONF_Int32(disk_stat_monitor_interval, "5"); CONF_Int32(unused_index_monitor_interval, "30"); CONF_String(storage_root_path, "${PALO_HOME}/storage"); @@ -219,23 +219,28 @@ namespace config { CONF_Int32(disk_capacity_insufficient_percentage, "90"); // check row nums for BE/CE and schema change. true is open, false is closed. CONF_Bool(row_nums_check, "true") - // be policy - CONF_Int32(base_expansion_thread_num, "1"); - CONF_Int64(be_policy_start_time, "20"); - CONF_Int64(be_policy_end_time, "7"); //file descriptors cache, by default, cache 30720 descriptors CONF_Int32(file_descriptor_cache_capacity, "30720"); CONF_Int64(index_stream_cache_capacity, "10737418240"); CONF_Int64(max_packed_row_block_size, "20971520"); - CONF_Int32(cumulative_write_mbytes_per_sec, "100"); - CONF_Int64(ce_policy_delta_files_number, "5"); - // ce policy: max delta file's size unit:B - CONF_Int32(cumulative_thread_num, "1"); - CONF_Int64(ce_policy_max_delta_file_size, "104857600"); - CONF_Int64(be_policy_cumulative_files_number, "5"); - CONF_Double(be_policy_cumulative_base_ratio, "0.3"); - CONF_Int64(be_policy_be_interval_seconds, "604800"); - CONF_Int32(cumulative_source_overflow_ratio, "5"); + + // be policy + CONF_Int64(base_compaction_start_hour, "20"); + CONF_Int64(base_compaction_end_hour, "7"); + 
CONF_Int32(base_compaction_check_interval_seconds, "60"); + CONF_Int64(base_compaction_num_cumulative_deltas, "5"); + CONF_Int32(base_compaction_num_threads, "1"); + CONF_Double(base_cumulative_delta_ratio, "0.3"); + CONF_Int64(base_compaction_interval_seconds_since_last_operation, "604800"); + CONF_Int32(base_compaction_write_mbytes_per_sec, "5"); + + // cumulative compaction policy: max delta file's size unit:B + CONF_Int32(cumulative_compaction_check_interval_seconds, "10"); + CONF_Int64(cumulative_compaction_num_singleton_deltas, "5"); + CONF_Int32(cumulative_compaction_num_threads, "1"); + CONF_Int64(cumulative_compaction_budgeted_bytes, "104857600"); + CONF_Int32(cumulative_compaction_write_mbytes_per_sec, "100"); + CONF_Int32(delete_delta_expire_time, "1440"); // Port to start debug webserver on CONF_Int32(webserver_port, "8040"); @@ -296,7 +301,8 @@ namespace config { // for partition CONF_Bool(enable_partitioned_hash_join, "false") CONF_Bool(enable_partitioned_aggregation, "false") - + CONF_Bool(enable_new_partitioned_aggregation, "true") + // for kudu // "The maximum size of the row batch queue, for Kudu scanners." 
CONF_Int32(kudu_max_row_batches, "0") @@ -320,6 +326,51 @@ namespace config { // to forward compatibility, will be removed later CONF_Bool(enable_token_check, "true"); + + // to open/close system metrics + CONF_Bool(enable_system_metrics, "true"); + + CONF_Bool(enable_prefetch, "true"); + + // cpu count + CONF_Int32(flags_num_cores, "32"); + + CONF_Bool(FLAGS_thread_creation_fault_injection, "false"); + + // Set this to encrypt and perform an integrity + // check on all data spilled to disk during a query + CONF_Bool(FLAGS_disk_spill_encryption, "false"); + + // Writable scratch directories + CONF_String(FLAGS_scratch_dirs, "/tmp"); + + // If false and --scratch_dirs contains multiple directories on the same device, + // then only the first writable directory is used + CONF_Bool(FLAGS_allow_multiple_scratch_dirs_per_device, "false"); + + // linux transparent huge page + CONF_Bool(FLAGS_madvise_huge_pages, "false"); + + // whether use mmap to allocate memory + CONF_Bool(FLAGS_mmap_buffers, "false"); + + // whether or not user mem pool + CONF_Bool(FLAGS_disable_mem_pools, "false"); + + // max memory can be allocated by buffer pool + CONF_String(FLAGS_buffer_pool_limit, "80G"); + + // clean page can be hold by buffer pool + CONF_String(FLAGS_buffer_pool_clean_pages_limit, "20G"); + + // buffer pool can support min memory allocated + CONF_Int32(FLAGS_min_buffer_size, "1024"); + + // Sleep time in seconds between memory maintenance iterations + CONF_Int64(FLAGS_memory_maintenance_sleep_time_s, "10"); + + // Aligement + CONF_Int32(FLAGS_MEMORY_MAX_ALIGNMENT, "16"); } // namespace config } // namespace palo diff --git a/be/src/common/daemon.cpp b/be/src/common/daemon.cpp index 7cd5bb5dcd..82f5b8e105 100644 --- a/be/src/common/daemon.cpp +++ b/be/src/common/daemon.cpp @@ -20,7 +20,10 @@ #include "common/daemon.h" +#include #include + +#include "common/config.h" #include "util/cpu_info.h" #include "util/debug_util.h" #include "util/disk_info.h" @@ -28,6 +31,10 @@ #include 
"util/mem_info.h" #include "util/network_util.h" #include "util/thrift_util.h" +#include "util/palo_metrics.h" +#include "runtime/bufferpool/buffer_pool.h" +#include "runtime/exec_env.h" +#include "runtime/mem_tracker.h" #include "runtime/lib_cache.h" #include "exprs/operators.h" #include "exprs/is_null_predicate.h" @@ -43,6 +50,7 @@ #include "exprs/utility_functions.h" #include "exprs/json_functions.h" #include "exprs/hll_hash_function.h" +#include "olap/olap_rootpath.h" namespace palo { @@ -52,16 +60,15 @@ void* tcmalloc_gc_thread(void* dummy) { size_t used_size = 0; size_t free_size = 0; -#ifndef ADDRESS_SANITIZER +#if !defined(ADDRESS_SANITIZER) && !defined(LEAK_SANITIZER) && !defined(THREAD_SANITIZER) MallocExtension::instance()->GetNumericProperty("generic.current_allocated_bytes", &used_size); MallocExtension::instance()->GetNumericProperty("tcmalloc.pageheap_free_bytes", &free_size); #endif size_t alloc_size = used_size + free_size; if (alloc_size > config::tc_use_memory_min) { +#if !defined(ADDRESS_SANITIZER) && !defined(LEAK_SANITIZER) && !defined(THREAD_SANITIZER) size_t max_free_size = alloc_size * config::tc_free_memory_rate / 100; - -#ifndef ADDRESS_SANITIZER if (free_size > max_free_size) { MallocExtension::instance()->ReleaseToSystem(free_size - max_free_size); } @@ -71,10 +78,66 @@ void* tcmalloc_gc_thread(void* dummy) { return NULL; } + +void* memory_maintenance_thread(void* dummy) { + while (true) { + sleep(config::FLAGS_memory_maintenance_sleep_time_s); + ExecEnv* env = ExecEnv::GetInstance(); + // ExecEnv may not have been created yet or this may be the catalogd or statestored, + // which don't have ExecEnvs. + if (env != nullptr) { + BufferPool* buffer_pool = env->buffer_pool(); + if (buffer_pool != nullptr) buffer_pool->Maintenance(); + + // The process limit as measured by our trackers may get out of sync with the + // process usage if memory is allocated or freed without updating a MemTracker. 
+ // The metric is refreshed whenever memory is consumed or released via a MemTracker, + // so on a system with queries executing it will be refreshed frequently. However + // if the system is idle, we need to refresh the tracker occasionally since + // untracked memory may be allocated or freed, e.g. by background threads. + if (env->process_mem_tracker() != nullptr && + !env->process_mem_tracker()->is_consumption_metric_null()) { + env->process_mem_tracker()->RefreshConsumptionFromMetric(); + } + } + } + + return NULL; +} + +static void init_palo_metrics() { + bool init_system_metrics = config::enable_system_metrics; + std::set disk_devices; + std::vector network_interfaces; + if (init_system_metrics) { + std::vector paths; + std::vector capacities; + auto res = OLAPRootPath::parse_root_paths_from_string( + config::storage_root_path.c_str(), &paths, &capacities); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "parse storage_root_path failed, res=" << res + << ", path=" << config::storage_root_path; + return; + } + auto st = DiskInfo::get_disk_devices(paths, &disk_devices); + if (!st.ok()) { + LOG(WARNING) << "get disk devices failed, stauts=" << st.get_error_msg(); + return; + } + st = get_inet_interfaces(&network_interfaces); + if (!st.ok()) { + LOG(WARNING) << "get inet interfaces failed, stauts=" << st.get_error_msg(); + return; + } + } + PaloMetrics::instance()->initialize( + "palo_be", init_system_metrics, disk_devices, network_interfaces); +} void init_daemon(int argc, char** argv) { // google::SetVersionString(get_build_version(false)); // google::ParseCommandLineFlags(&argc, &argv, true); + google::ParseCommandLineFlags(&argc, &argv, true); init_glog("be", true); LOG(INFO) << get_version_string(false); @@ -99,12 +162,16 @@ void init_daemon(int argc, char** argv) { JsonFunctions::init(); HllHashFunctions::init(); - pthread_t id; - pthread_create(&id, NULL, tcmalloc_gc_thread, NULL); + pthread_t tc_malloc_pid; + pthread_create(&tc_malloc_pid, NULL, 
tcmalloc_gc_thread, NULL); + pthread_t buffer_pool_pid; + pthread_create(&buffer_pool_pid, NULL, memory_maintenance_thread, NULL); + LOG(INFO) << CpuInfo::debug_string(); LOG(INFO) << DiskInfo::debug_string(); LOG(INFO) << MemInfo::debug_string(); + init_palo_metrics(); } } diff --git a/be/src/common/hdfs.h b/be/src/common/hdfs.h index 459277c72d..a2390164a1 100644 --- a/be/src/common/hdfs.h +++ b/be/src/common/hdfs.h @@ -18,8 +18,8 @@ // specific language governing permissions and limitations // under the License. -#ifndef IMPALA_COMMON_HDFS_H -#define IMPALA_COMMON_HDFS_H +#ifndef BDG_PALO_BE_SRC_COMMON_COMMON_HDFS_H +#define BDG_PALO_BE_SRC_COMMON_COMMON_HDFS_H // This is a wrapper around the hdfs header. When we are compiling to IR, // we don't want to pull in the hdfs headers. We only need the headers diff --git a/be/src/common/names.h b/be/src/common/names.h new file mode 100755 index 0000000000..a0ffff58a8 --- /dev/null +++ b/be/src/common/names.h @@ -0,0 +1,194 @@ +// Modifications copyright (C) 2017, Baidu.com, Inc. +// Copyright 2017 The Apache Software Foundation + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +/// The motivation for the using declarations below is to allow accessing the most +/// relevant and most frequently used library classes without having to explicitly pull +/// them into the global namespace. The goal is that when readers sees a usage of vector +/// (etc.) without any further specialization they can rely on the fact that it will be a +/// std::vector. +// +/// Instead of actually including the header files for the symbols, this file only checks +/// if certain include guards are defined before applying the using declaration. This +/// makes sure that including this file has no impact on the compile time. +// +/// Please make sure that this file is included last in the cc file's include list to make +/// sure that all relevant include guards are defined. +// +/// The content of this file is manually curated and should only be changed on rare +/// occasions. +#include + +#ifdef _GLIBCXX_VECTOR +using std::vector; +#endif + +#ifdef _GLIBCXX_MAP +using std::map; +using std::multimap; +#endif + +#ifdef _GLIBCXX_LIST +using std::list; +#endif + +#ifdef _GLIBCXX_SET +using std::set; +using std::multiset; +#endif + +#ifdef _GLIBCXX_STACK +using std::stack; +#endif + +#ifdef _GLIBCXX_QUEUE +using std::queue; +#endif + +#ifdef _GLIBCXX_DEQUE +using std::deque; +#endif + +#ifdef _GLIBCXX_STRING +using std::string; +#endif + +#ifdef _GLIBCXX_IOSTREAM +using std::cout; +using std::cin; +using std::cerr; +#endif + +#ifdef _GLIBCXX_OSTREAM +using std::ostream; +using std::endl; +#endif + +#ifdef _GLIBCXX_IOS +using std::fixed; +using std::hex; +using std::oct; +using std::dec; +using std::left; +using std::ios; +#endif + +#ifdef _GLIBCXX_IOMANIP +using std::setprecision; +using std::setfill; +using std::setw; +#endif + + +#ifdef _GLIBCXX_FSTREAM +using std::fstream; +using std::ifstream; +using std::ofstream; +#endif + + +#ifdef _GLIBCXX_SSTREAM +using std::stringstream; +using std::istringstream; +using std::ostringstream; +#endif + +#ifdef _GLIBCXX_ALGORITHM 
+using std::swap; +using std::min; +using std::max; +using std::sort; +#endif + +#ifdef _GLIBCXX_MEMORY +using std::make_shared; +using std::shared_ptr; +using std::unique_ptr; +#endif + +#ifdef _GLIBCXX_UTILITY +using std::move; +#endif + +#ifdef _NEW +using std::nothrow; +#endif + +#ifdef BOOST_THREAD_THREAD_COMMON_HPP +using boost::thread; +#endif + +#ifdef BOOST_THREAD_DETAIL_THREAD_GROUP_HPP +using boost::thread_group; +#endif + +#ifdef BOOST_THREAD_MUTEX_HPP +using boost::mutex; +using boost::try_mutex; +#endif + +#ifdef BOOST_LEXICAL_CAST_INCLUDED +using boost::lexical_cast; +#endif + +#ifdef BOOST_THREAD_PTHREAD_SHARED_MUTEX_HPP +using boost::shared_mutex; +#endif + + +/// In older versions of boost, when including mutex.hpp, it would include locks.hpp that +/// would in turn provide lock_guard<>. In more recent versions, including mutex.hpp would +/// include lock_types.hpp that does not provide lock_guard<>. This check verifies if boost +/// locks have been included and makes sure to only include lock_guard if the provided lock +/// implementations were not included using lock_types.hpp (for older boost versions) or if +/// lock_guard.hpp was explicitly included. 
+#if (defined(BOOST_THREAD_LOCKS_HPP) && BOOST_VERSION < 105300) || defined(BOOST_THREAD_LOCK_GUARD_HPP) +using boost::lock_guard; +#endif + +#if defined(BOOST_THREAD_LOCKS_HPP) || defined(BOOST_THREAD_LOCK_TYPES_HPP) +using boost::unique_lock; +using boost::shared_lock; +using boost::upgrade_lock; +#endif + +#ifdef BOOST_SMART_PTR_SCOPED_PTR_HPP_INCLUDED +using boost::scoped_ptr; +#endif + +#ifdef BOOST_UNORDERED_MAP_HPP_INCLUDED +using boost::unordered_map; +#endif + +#ifdef BOOST_UNORDERED_SET_HPP_INCLUDED +using boost::unordered_set; +#endif + +#ifdef BOOST_FUNCTION_PROLOGUE_HPP +using boost::function; +#endif + +#ifdef BOOST_BIND_HPP_INCLUDED +using boost::bind; +using boost::mem_fn; +#endif + +#ifdef STRINGS_SUBSTITUTE_H_ +using strings::Substitute; +#endif diff --git a/be/src/common/status.cpp b/be/src/common/status.cpp index f66aca4674..9e5bf3bc9a 100644 --- a/be/src/common/status.cpp +++ b/be/src/common/status.cpp @@ -136,4 +136,14 @@ void Status::to_thrift(TStatus* status) const { } } +void Status::MergeStatus(const Status& status) { + if (status.ok()) return; + if (_error_detail == NULL) { + _error_detail = new ErrorDetail(status.code()); + } else { + std::vector msgs_vector; + status.get_error_msgs(&msgs_vector); + for (const std::string& s: msgs_vector) add_error_msg(s); + } +} } diff --git a/be/src/common/status.h b/be/src/common/status.h index ba32b2973d..f10a8be969 100644 --- a/be/src/common/status.h +++ b/be/src/common/status.h @@ -164,6 +164,11 @@ public: return _error_detail == NULL ? TStatusCode::OK : _error_detail->error_code; } + /// Does nothing if status.ok(). + /// Otherwise: if 'this' is an error status, adds the error msg from 'status'; + /// otherwise assigns 'status'. 
+ void MergeStatus(const Status& status); + private: struct ErrorDetail { TStatusCode::type error_code; // anything other than OK diff --git a/be/src/exec/CMakeLists.txt b/be/src/exec/CMakeLists.txt index d555f176ad..3283debc98 100644 --- a/be/src/exec/CMakeLists.txt +++ b/be/src/exec/CMakeLists.txt @@ -80,6 +80,10 @@ set(EXEC_FILES partitioned_hash_table_ir.cc partitioned_aggregation_node.cc partitioned_aggregation_node_ir.cc + new_partitioned_hash_table.cc + new_partitioned_hash_table_ir.cc + new_partitioned_aggregation_node.cc + new_partitioned_aggregation_node_ir.cc local_file_writer.cpp broker_writer.cpp ) diff --git a/be/src/exec/aggregation_node.cpp b/be/src/exec/aggregation_node.cpp index 8a2a2558a7..23949a4f9d 100644 --- a/be/src/exec/aggregation_node.cpp +++ b/be/src/exec/aggregation_node.cpp @@ -18,1010 +18,1006 @@ // specific language governing permissions and limitations // under the License. -#include "exec/aggregation_node.h" - -#include -#include -#include -#include -#include -#include - -#include "codegen/codegen_anyval.h" -#include "codegen/llvm_codegen.h" -#include "exec/hash_table.hpp" -#include "exprs/agg_fn_evaluator.h" -#include "exprs/expr.h" -#include "exprs/slot_ref.h" -#include "gen_cpp/Exprs_types.h" -#include "gen_cpp/PlanNodes_types.h" -#include "runtime/descriptors.h" -#include "runtime/mem_pool.h" -#include "runtime/raw_value.h" -#include "runtime/row_batch.h" -#include "runtime/runtime_state.h" -#include "runtime/string_value.hpp" -#include "runtime/tuple.h" -#include "runtime/tuple_row.h" -#include "util/debug_util.h" -#include "util/runtime_profile.h" - -using llvm::BasicBlock; -using llvm::Function; -using llvm::PointerType; -using llvm::Type; -using llvm::Value; -using llvm::StructType; - -namespace palo { - -const char* AggregationNode::_s_llvm_class_name = "class.palo::AggregationNode"; - -// TODO: pass in maximum size; enforce by setting limit in mempool -// TODO: have a Status ExecNode::init(const TPlanNode&) member function 
-// that does initialization outside of c'tor, so we can indicate errors -AggregationNode::AggregationNode( - ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs) : - ExecNode(pool, tnode, descs), - _intermediate_tuple_id(tnode.agg_node.intermediate_tuple_id), - _intermediate_tuple_desc(NULL), - _output_tuple_id(tnode.agg_node.output_tuple_id), - _output_tuple_desc(NULL), - _singleton_output_tuple(NULL), - //_tuple_pool(new MemPool()), - // - _codegen_process_row_batch_fn(NULL), - _process_row_batch_fn(NULL), - _needs_finalize(tnode.agg_node.need_finalize), - _build_timer(NULL), - _get_results_timer(NULL), - _hash_table_buckets_counter(NULL) { -} - -AggregationNode::~AggregationNode() { -} - -Status AggregationNode::init(const TPlanNode& tnode) { - RETURN_IF_ERROR(ExecNode::init(tnode)); - // ignore return status for now , so we need to introduct ExecNode::init() - RETURN_IF_ERROR(Expr::create_expr_trees( - _pool, tnode.agg_node.grouping_exprs, &_probe_expr_ctxs)); - - for (int i = 0; i < tnode.agg_node.aggregate_functions.size(); ++i) { - AggFnEvaluator* evaluator = NULL; - AggFnEvaluator::create( - _pool, tnode.agg_node.aggregate_functions[i], &evaluator); - _aggregate_evaluators.push_back(evaluator); - } - return Status::OK; -} - -Status AggregationNode::prepare(RuntimeState* state) { - RETURN_IF_ERROR(ExecNode::prepare(state)); - _build_timer = ADD_TIMER(runtime_profile(), "BuildTime"); - _get_results_timer = ADD_TIMER(runtime_profile(), "GetResultsTime"); - _hash_table_buckets_counter = - ADD_COUNTER(runtime_profile(), "BuildBuckets", TUnit::UNIT); - _hash_table_load_factor_counter = - ADD_COUNTER(runtime_profile(), "LoadFactor", TUnit::DOUBLE_VALUE); - - SCOPED_TIMER(_runtime_profile->total_time_counter()); - - _intermediate_tuple_desc = - state->desc_tbl().get_tuple_descriptor(_intermediate_tuple_id); - _output_tuple_desc = state->desc_tbl().get_tuple_descriptor(_output_tuple_id); - DCHECK_EQ(_intermediate_tuple_desc->slots().size(), 
_output_tuple_desc->slots().size()); - RETURN_IF_ERROR(Expr::prepare( - _probe_expr_ctxs, state, child(0)->row_desc(), expr_mem_tracker())); - - // Construct build exprs from _agg_tuple_desc - for (int i = 0; i < _probe_expr_ctxs.size(); ++i) { - SlotDescriptor* desc = _intermediate_tuple_desc->slots()[i]; - Expr* expr = new SlotRef(desc); - state->obj_pool()->add(expr); - _build_expr_ctxs.push_back(new ExprContext(expr)); - state->obj_pool()->add(_build_expr_ctxs.back()); - } - - // Construct a new row desc for preparing the build exprs because neither the child's - // nor this node's output row desc may contain the intermediate tuple, e.g., - // in a single-node plan with an intermediate tuple different from the output tuple. - RowDescriptor build_row_desc(_intermediate_tuple_desc, false); - RETURN_IF_ERROR(Expr::prepare( - _build_expr_ctxs, state, build_row_desc, expr_mem_tracker())); - - _tuple_pool.reset(new MemPool(mem_tracker(), 0)); - - _agg_fn_ctxs.resize(_aggregate_evaluators.size()); - int j = _probe_expr_ctxs.size(); - for (int i = 0; i < _aggregate_evaluators.size(); ++i, ++j) { - // skip non-materialized slots; we don't have evaluators instantiated for those - // while (!_agg_tuple_desc->slots()[j]->is_materialized()) { - // DCHECK_LT(j, _agg_tuple_desc->slots().size() - 1) - // << "#eval= " << _aggregate_evaluators.size() - // << " #probe=" << _probe_expr_ctxs.size(); - // ++j; - // } - SlotDescriptor* intermediate_slot_desc = _intermediate_tuple_desc->slots()[j]; - SlotDescriptor* output_slot_desc = _output_tuple_desc->slots()[j]; - RETURN_IF_ERROR(_aggregate_evaluators[i]->prepare( - state, child(0)->row_desc(), _tuple_pool.get(), - intermediate_slot_desc, output_slot_desc, mem_tracker(), &_agg_fn_ctxs[i])); - state->obj_pool()->add(_agg_fn_ctxs[i]); - } - - // TODO: how many buckets? 
- _hash_tbl.reset(new HashTable( - _build_expr_ctxs, _probe_expr_ctxs, 1, true, id(), mem_tracker(), 1024)); - - if (_probe_expr_ctxs.empty()) { - // create single output tuple now; we need to output something - // even if our input is empty - _singleton_output_tuple = construct_intermediate_tuple(); - } - - if (state->codegen_level() > 0) { - LlvmCodeGen* codegen = NULL; - RETURN_IF_ERROR(state->get_codegen(&codegen)); - Function* update_tuple_fn = codegen_update_tuple(state); - if (update_tuple_fn != NULL) { - _codegen_process_row_batch_fn = - codegen_process_row_batch(state, update_tuple_fn); - if (_codegen_process_row_batch_fn != NULL) { - // Update to using codegen'd process row batch. - codegen->add_function_to_jit(_codegen_process_row_batch_fn, - reinterpret_cast(&_process_row_batch_fn)); - // AddRuntimeExecOption("Codegen Enabled"); - } - } - } - - return Status::OK; -} - -Status AggregationNode::open(RuntimeState* state) { - RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); - SCOPED_TIMER(_runtime_profile->total_time_counter()); - RETURN_IF_ERROR(ExecNode::open(state)); - RETURN_IF_ERROR(Expr::open(_probe_expr_ctxs, state)); - RETURN_IF_ERROR(Expr::open(_build_expr_ctxs, state)); - - for (int i = 0; i < _aggregate_evaluators.size(); ++i) { - RETURN_IF_ERROR(_aggregate_evaluators[i]->open(state, _agg_fn_ctxs[i])); - } - - RETURN_IF_ERROR(_children[0]->open(state)); - - RowBatch batch(_children[0]->row_desc(), state->batch_size(), mem_tracker()); - int64_t num_input_rows = 0; - int64_t num_agg_rows = 0; - - bool early_return = false; - bool limit_with_no_agg = (limit() != -1 && (_aggregate_evaluators.size() == 0)); - DCHECK_EQ(_aggregate_evaluators.size(), _agg_fn_ctxs.size()); - - while (true) { - bool eos = false; - RETURN_IF_CANCELLED(state); - RETURN_IF_ERROR(state->check_query_state()); - RETURN_IF_ERROR(_children[0]->get_next(state, &batch, &eos)); - // SCOPED_TIMER(_build_timer); - if (VLOG_ROW_IS_ON) { - for (int i = 0; i < batch.num_rows(); 
++i) { - TupleRow* row = batch.get_row(i); - VLOG_ROW << "id=" << id() << " input row: " - << print_row(row, _children[0]->row_desc()); - } - } - - int64_t agg_rows_before = _hash_tbl->size(); - - if (_process_row_batch_fn != NULL) { - _process_row_batch_fn(this, &batch); - } else if (_singleton_output_tuple != NULL) { - SCOPED_TIMER(_build_timer); - process_row_batch_no_grouping(&batch, _tuple_pool.get()); - } else { - process_row_batch_with_grouping(&batch, _tuple_pool.get()); - if (limit_with_no_agg) { - if (_hash_tbl->size() >= limit()) { - early_return = true; - } - } - } - - // RETURN_IF_LIMIT_EXCEEDED(state); - RETURN_IF_ERROR(state->check_query_state()); - - COUNTER_SET(_hash_table_buckets_counter, _hash_tbl->num_buckets()); - COUNTER_SET(memory_used_counter(), - _tuple_pool->peak_allocated_bytes() + _hash_tbl->byte_size()); - COUNTER_SET(_hash_table_load_factor_counter, _hash_tbl->load_factor()); - num_agg_rows += (_hash_tbl->size() - agg_rows_before); - num_input_rows += batch.num_rows(); - - batch.reset(); - - RETURN_IF_ERROR(state->check_query_state()); - if (eos) { - break; - } - if (early_return) { - break; - } - } - - if (_singleton_output_tuple != NULL) { - _hash_tbl->insert(reinterpret_cast(&_singleton_output_tuple)); - ++num_agg_rows; - } - - VLOG_ROW << "id=" << id() << " aggregated " << num_input_rows << " input rows into " - << num_agg_rows << " output rows"; - _output_iterator = _hash_tbl->begin(); - return Status::OK; -} - -Status AggregationNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - SCOPED_TIMER(_runtime_profile->total_time_counter()); - RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); - RETURN_IF_CANCELLED(state); - RETURN_IF_ERROR(state->check_query_state()); - SCOPED_TIMER(_get_results_timer); - - if (reached_limit()) { - *eos = true; - return Status::OK; - } - - ExprContext** ctxs = &_conjunct_ctxs[0]; - int num_ctxs = _conjunct_ctxs.size(); - - int count = 0; - const int N = state->batch_size(); 
- while (!_output_iterator.at_end() && !row_batch->at_capacity()) { - // This loop can go on for a long time if the conjuncts are very selective. Do query - // maintenance every N iterations. - if (count++ % N == 0) { - RETURN_IF_CANCELLED(state); - RETURN_IF_ERROR(state->check_query_state()); - } - int row_idx = row_batch->add_row(); - TupleRow* row = row_batch->get_row(row_idx); - Tuple* intermediate_tuple = _output_iterator.get_row()->get_tuple(0); - Tuple* output_tuple = - finalize_tuple(intermediate_tuple, row_batch->tuple_data_pool()); - row->set_tuple(0, output_tuple); - - if (ExecNode::eval_conjuncts(ctxs, num_ctxs, row)) { - VLOG_ROW << "output row: " << print_row(row, row_desc()); - row_batch->commit_last_row(); - ++_num_rows_returned; - - if (reached_limit()) { - // avoid calling finalize() duplicately with last tuple - // when _output_iterator don't reach end. - // chenhao added - _output_iterator.next(); - break; - } - } - - _output_iterator.next(); - } - - *eos = _output_iterator.at_end() || reached_limit(); - if (*eos) { - if (memory_used_counter() != NULL && _hash_tbl.get() != NULL && - _hash_table_buckets_counter != NULL) { - COUNTER_SET(memory_used_counter(), - _tuple_pool->peak_allocated_bytes() + _hash_tbl->byte_size()); - COUNTER_SET(_hash_table_buckets_counter, _hash_tbl->num_buckets()); - } - } - COUNTER_SET(_rows_returned_counter, _num_rows_returned); - return Status::OK; -} - -Status AggregationNode::close(RuntimeState* state) { - if (is_closed()) { - return Status::OK; - } - - // Iterate through the remaining rows in the hash table and call Serialize/Finalize on - // them in order to free any memory allocated by UDAs. Finalize() requires a dst tuple - // but we don't actually need the result, so allocate a single dummy tuple to avoid - // accumulating memory. 
- Tuple* dummy_dst = NULL; - if (_needs_finalize && _output_tuple_desc != NULL) { - dummy_dst = Tuple::create(_output_tuple_desc->byte_size(), _tuple_pool.get()); - } - while (!_output_iterator.at_end()) { - Tuple* tuple = _output_iterator.get_row()->get_tuple(0); - if (_needs_finalize) { - AggFnEvaluator::finalize(_aggregate_evaluators, _agg_fn_ctxs, tuple, dummy_dst); - } else { - AggFnEvaluator::serialize(_aggregate_evaluators, _agg_fn_ctxs, tuple); - } - _output_iterator.next(); - } - - for (int i = 0; i < _aggregate_evaluators.size(); ++i) { - _aggregate_evaluators[i]->close(state); - if (!_agg_fn_ctxs.empty() && _agg_fn_ctxs[i] && _agg_fn_ctxs[i]->impl()) { - _agg_fn_ctxs[i]->impl()->close(); - } - } - - if (_tuple_pool.get() != NULL) { - _tuple_pool->free_all(); - } - if (_hash_tbl.get() != NULL) { - _hash_tbl->close(); - } - - Expr::close(_probe_expr_ctxs, state); - Expr::close(_build_expr_ctxs, state); - - return ExecNode::close(state); -} - -Tuple* AggregationNode::construct_intermediate_tuple() { - Tuple* agg_tuple = Tuple::create(_intermediate_tuple_desc->byte_size(), _tuple_pool.get()); - vector::const_iterator slot_desc = _intermediate_tuple_desc->slots().begin(); - - // copy grouping values - for (int i = 0; i < _probe_expr_ctxs.size(); ++i, ++slot_desc) { - if (_hash_tbl->last_expr_value_null(i)) { - agg_tuple->set_null((*slot_desc)->null_indicator_offset()); - } else { - void* src = _hash_tbl->last_expr_value(i); - void* dst = agg_tuple->get_slot((*slot_desc)->tuple_offset()); - RawValue::write(src, dst, (*slot_desc)->type(), _tuple_pool.get()); - } - } - - // Initialize aggregate output. - for (int i = 0; i < _aggregate_evaluators.size(); ++i, ++slot_desc) { - while (!(*slot_desc)->is_materialized()) { - ++slot_desc; - } - - AggFnEvaluator* evaluator = _aggregate_evaluators[i]; - evaluator->init(_agg_fn_ctxs[i], agg_tuple); - - // Codegen specific path. 
- // To minimize branching on the UpdateAggTuple path, initialize the result value - // so that UpdateAggTuple doesn't have to check if the aggregation - // dst slot is null. - // - sum/count: 0 - // - min: max_value - // - max: min_value - // TODO: remove when we don't use the irbuilder for codegen here. - // This optimization no longer applies with AnyVal - if (!(*slot_desc)->type().is_string_type() && - !(*slot_desc)->type().is_date_type()) { - ExprValue default_value; - void* default_value_ptr = NULL; - - switch (evaluator->agg_op()) { - case TAggregationOp::MIN: - default_value_ptr = default_value.set_to_max((*slot_desc)->type()); - RawValue::write(default_value_ptr, agg_tuple, *slot_desc, NULL); - break; - - case TAggregationOp::MAX: - default_value_ptr = default_value.set_to_min((*slot_desc)->type()); - RawValue::write(default_value_ptr, agg_tuple, *slot_desc, NULL); - break; - - default: - break; - } - } - } - - return agg_tuple; -} - -void AggregationNode::update_tuple(Tuple* tuple, TupleRow* row) { - DCHECK(tuple != NULL); - - AggFnEvaluator::add(_aggregate_evaluators, _agg_fn_ctxs, row, tuple); -#if 0 - vector::const_iterator evaluator; - int i = 0; - for (evaluator = _aggregate_evaluators.begin(); - evaluator != _aggregate_evaluators.end(); ++evaluator, ++i) { - (*evaluator)->choose_update_or_merge(_agg_fn_ctxs[i], row, tuple); - //if (_is_merge) { - // (*evaluator)->merge(_agg_fn_ctxs[i], row, tuple, pool); - //} else { - // (*evaluator)->update(_agg_fn_ctxs[i], row, tuple, pool); - //} - } -#endif -} - -Tuple* AggregationNode::finalize_tuple(Tuple* tuple, MemPool* pool) { - DCHECK(tuple != NULL); - - Tuple* dst = tuple; - if (_needs_finalize && _intermediate_tuple_id != _output_tuple_id) { - dst = Tuple::create(_output_tuple_desc->byte_size(), pool); - } - if (_needs_finalize) { - AggFnEvaluator::finalize(_aggregate_evaluators, _agg_fn_ctxs, tuple, dst); - } else { - AggFnEvaluator::serialize(_aggregate_evaluators, _agg_fn_ctxs, tuple); - } - // Copy 
grouping values from tuple to dst. - // TODO: Codegen this. - if (dst != tuple) { - int num_grouping_slots = _probe_expr_ctxs.size(); - for (int i = 0; i < num_grouping_slots; ++i) { - SlotDescriptor* src_slot_desc = _intermediate_tuple_desc->slots()[i]; - SlotDescriptor* dst_slot_desc = _output_tuple_desc->slots()[i]; - bool src_slot_null = tuple->is_null(src_slot_desc->null_indicator_offset()); - void* src_slot = NULL; - if (!src_slot_null) src_slot = tuple->get_slot(src_slot_desc->tuple_offset()); - RawValue::write(src_slot, dst, dst_slot_desc, NULL); - } - } - return dst; -} - -void AggregationNode::debug_string(int indentation_level, std::stringstream* out) const { - *out << std::string(indentation_level * 2, ' '); - *out << "AggregationNode(intermediate_tuple_id=" << _intermediate_tuple_id - << " output_tuple_id=" << _output_tuple_id - << " needs_finalize=" << _needs_finalize - // << " probe_exprs=" << Expr::debug_string(_probe_exprs) - << " agg_exprs=" << AggFnEvaluator::debug_string(_aggregate_evaluators); - ExecNode::debug_string(indentation_level, out); - *out << ")"; -} - -void AggregationNode::push_down_predicate(RuntimeState *state, - std::list *expr_ctxs) { - // groupby can pushdown, agg can't pushdown - // Now we doesn't pushdown for easy. 
- return; -} - -static IRFunction::Type get_hll_update_function2(const TypeDescriptor& type) { - switch (type.type) { - case TYPE_BOOLEAN: - return IRFunction::HLL_UPDATE_BOOLEAN; - case TYPE_TINYINT: - return IRFunction::HLL_UPDATE_TINYINT; - case TYPE_SMALLINT: - return IRFunction::HLL_UPDATE_SMALLINT; - case TYPE_INT: - return IRFunction::HLL_UPDATE_INT; - case TYPE_BIGINT: - return IRFunction::HLL_UPDATE_BIGINT; - case TYPE_FLOAT: - return IRFunction::HLL_UPDATE_FLOAT; - case TYPE_DOUBLE: - return IRFunction::HLL_UPDATE_DOUBLE; - case TYPE_CHAR: - case TYPE_VARCHAR: - return IRFunction::HLL_UPDATE_STRING; - case TYPE_DECIMAL: - return IRFunction::HLL_UPDATE_DECIMAL; - default: - DCHECK(false) << "Unsupported type: " << type; - return IRFunction::FN_END; - } -} - -// IR Generation for updating a single aggregation slot. Signature is: -// void update_slot(FunctionContext* fn_ctx, AggTuple* agg_tuple, char** row) -// -// The IR for sum(double_col) is: -// define void @update_slot(%"class.palo_udf::FunctionContext"* %fn_ctx, -// { i8, double }* %agg_tuple, -// %"class.palo::TupleRow"* %row) #20 { -// entry: -// %src = call { i8, double } @GetSlotRef(%"class.palo::ExprContext"* inttoptr -// (i64 128241264 to %"class.palo::ExprContext"*), %"class.palo::TupleRow"* %row) -// %0 = extractvalue { i8, double } %src, 0 -// %is_null = trunc i8 %0 to i1 -// br i1 %is_null, label %ret, label %src_not_null -// -// src_not_null: ; preds = %entry -// %dst_slot_ptr = getelementptr inbounds { i8, double }* %agg_tuple, i32 0, i32 1 -// call void @SetNotNull({ i8, double }* %agg_tuple) -// %dst_val = load double* %dst_slot_ptr -// %val = extractvalue { i8, double } %src, 1 -// %1 = fadd double %dst_val, %val -// store double %1, double* %dst_slot_ptr -// br label %ret -// -// ret: ; preds = %src_not_null, %entry -// ret void -// } -// -// The IR for min(double_col) is: -// define void @update_slot(%"class.palo_udf::FunctionContext"* %fn_ctx, -// { i8, double }* %agg_tuple, -// 
%"class.palo::TupleRow"* %row) #20 { -// entry: -// %src = call { i8, double } @GetSlotRef(%"class.palo::ExprContext"* inttoptr -// (i64 128241264 to %"class.palo::ExprContext"*), %"class.palo::TupleRow"* %row) -// %0 = extractvalue { i8, double } %src, 0 -// %is_null = trunc i8 %0 to i1 -// br i1 %is_null, label %ret, label %src_not_null -// -// src_not_null: ; preds = %entry -// %dst_is_null = call i8 @is_null(tuple); -// br i1 %dst_is_null, label dst_null, label dst_not_null -// -// dst_null: ; preds = %entry -// %dst_slot_ptr = getelementptr inbounds { i8, double }* %agg_tuple, i32 0, i32 1 -// call void @SetNotNull({ i8, double }* %agg_tuple) -// %val = extractvalue { i8, double } %src, 1 -// store double %val, double* %dst_slot_ptr -// br label %ret -// -// dst_not_null: ; preds = %src_not_null -// %dst_slot_ptr = getelementptr inbounds { i8, double }* %agg_tuple, i32 0, i32 1 -// call void @SetNotNull({ i8, double }* %agg_tuple) -// %dst_val = load double* %dst_slot_ptr -// %val = extractvalue { i8, double } %src, 1 -// %1 = fadd double %dst_val, %val -// store double %1, double* %dst_slot_ptr -// br label %ret -// -// ret: ; preds = %src_not_null, %entry -// ret void -// } -// The IR for ndv(double_col) is: -// define void @update_slot(%"class.palo_udf::FunctionContext"* %fn_ctx, -// { i8, %"struct.palo::StringValue" }* %agg_tuple, -// %"class.palo::TupleRow"* %row) #20 { -// entry: -// %dst_lowered_ptr = alloca { i64, i8* } -// %src_lowered_ptr = alloca { i8, double } -// %src = call { i8, double } @GetSlotRef(%"class.palo::ExprContext"* inttoptr -// (i64 120530832 to %"class.palo::ExprContext"*), %"class.palo::TupleRow"* %row) -// %0 = extractvalue { i8, double } %src, 0 -// %is_null = trunc i8 %0 to i1 -// br i1 %is_null, label %ret, label %src_not_null -// -// src_not_null: ; preds = %entry -// %dst_slot_ptr = getelementptr inbounds -// { i8, %"struct.palo::StringValue" }* %agg_tuple, i32 0, i32 1 -// call void @SetNotNull({ i8, 
%"struct.palo::StringValue" }* %agg_tuple) -// %dst_val = load %"struct.palo::StringValue"* %dst_slot_ptr -// store { i8, double } %src, { i8, double }* %src_lowered_ptr -// %src_unlowered_ptr = bitcast { i8, double }* %src_lowered_ptr -// to %"struct.palo_udf::DoubleVal"* -// %ptr = extractvalue %"struct.palo::StringValue" %dst_val, 0 -// %dst_stringval = insertvalue { i64, i8* } zeroinitializer, i8* %ptr, 1 -// %len = extractvalue %"struct.palo::StringValue" %dst_val, 1 -// %1 = extractvalue { i64, i8* } %dst_stringval, 0 -// %2 = zext i32 %len to i64 -// %3 = shl i64 %2, 32 -// %4 = and i64 %1, 4294967295 -// %5 = or i64 %4, %3 -// %dst_stringval1 = insertvalue { i64, i8* } %dst_stringval, i64 %5, 0 -// store { i64, i8* } %dst_stringval1, { i64, i8* }* %dst_lowered_ptr -// %dst_unlowered_ptr = bitcast { i64, i8* }* %dst_lowered_ptr -// to %"struct.palo_udf::StringVal"* -// call void @HllUpdate(%"class.palo_udf::FunctionContext"* %fn_ctx, -// %"struct.palo_udf::DoubleVal"* %src_unlowered_ptr, -// %"struct.palo_udf::StringVal"* %dst_unlowered_ptr) -// %anyval_result = load { i64, i8* }* %dst_lowered_ptr -// %6 = extractvalue { i64, i8* } %anyval_result, 1 -// %7 = insertvalue %"struct.palo::StringValue" zeroinitializer, i8* %6, 0 -// %8 = extractvalue { i64, i8* } %anyval_result, 0 -// %9 = ashr i64 %8, 32 -// %10 = trunc i64 %9 to i32 -// %11 = insertvalue %"struct.palo::StringValue" %7, i32 %10, 1 -// store %"struct.palo::StringValue" %11, %"struct.palo::StringValue"* %dst_slot_ptr -// br label %ret -// -// ret: ; preds = %src_not_null, %entry -// ret void -// } -llvm::Function* AggregationNode::codegen_update_slot( - RuntimeState* state, AggFnEvaluator* evaluator, SlotDescriptor* slot_desc) { - DCHECK(slot_desc->is_materialized()); - LlvmCodeGen* codegen = NULL; - if (!state->get_codegen(&codegen).ok()) { - return NULL; - } - - DCHECK_EQ(evaluator->input_expr_ctxs().size(), 1); - ExprContext* input_expr_ctx = evaluator->input_expr_ctxs()[0]; - Expr* input_expr 
= input_expr_ctx->root(); - // TODO: implement timestamp - if (input_expr->type().type == TYPE_DATETIME - || input_expr->type().type == TYPE_DATE - || input_expr->type().type == TYPE_DECIMAL - || input_expr->type().is_string_type()) { - return NULL; - } - Function* agg_expr_fn = NULL; - Status status = input_expr->get_codegend_compute_fn(state, &agg_expr_fn); - if (!status.ok()) { - LOG(INFO) << "Could not codegen update_slot(): " << status.get_error_msg(); - return NULL; - } - DCHECK(agg_expr_fn != NULL); - - PointerType* fn_ctx_type = - codegen->get_ptr_type(FunctionContextImpl::_s_llvm_functioncontext_name); - StructType* tuple_struct = _intermediate_tuple_desc->generate_llvm_struct(codegen); - PointerType* tuple_ptr_type = PointerType::get(tuple_struct, 0); - PointerType* tuple_row_ptr_type = codegen->get_ptr_type(TupleRow::_s_llvm_class_name); - - // Create update_slot prototype - LlvmCodeGen::FnPrototype prototype(codegen, "update_slot", codegen->void_type()); - prototype.add_argument(LlvmCodeGen::NamedVariable("fn_ctx", fn_ctx_type)); - prototype.add_argument(LlvmCodeGen::NamedVariable("agg_tuple", tuple_ptr_type)); - prototype.add_argument(LlvmCodeGen::NamedVariable("row", tuple_row_ptr_type)); - - LlvmCodeGen::LlvmBuilder builder(codegen->context()); - Value* args[3]; - Function* fn = prototype.generate_prototype(&builder, &args[0]); - Value* fn_ctx_arg = args[0]; - Value* agg_tuple_arg = args[1]; - Value* row_arg = args[2]; - - BasicBlock* src_not_null_block = NULL; - BasicBlock* dst_null_block = NULL; - BasicBlock* dst_not_null_block = NULL; - if (evaluator->agg_op() == AggFnEvaluator::MIN - || evaluator->agg_op() == AggFnEvaluator::MAX) { - src_not_null_block = BasicBlock::Create(codegen->context(), "src_not_null", fn); - dst_null_block = BasicBlock::Create(codegen->context(), "dst_null", fn); - } - dst_not_null_block = BasicBlock::Create(codegen->context(), "dst_not_null", fn); - BasicBlock* ret_block = BasicBlock::Create(codegen->context(), "ret", 
fn); - - // Call expr function to get src slot value - Value* ctx_arg = codegen->cast_ptr_to_llvm_ptr( - codegen->get_ptr_type(ExprContext::_s_llvm_class_name), input_expr_ctx); - Value* agg_expr_fn_args[] = { ctx_arg, row_arg }; - CodegenAnyVal src = CodegenAnyVal::create_call_wrapped( - codegen, &builder, input_expr->type(), agg_expr_fn, agg_expr_fn_args, "src", NULL); - - Value* src_is_null = src.get_is_null(); - if (evaluator->agg_op() == AggFnEvaluator::MIN - || evaluator->agg_op() == AggFnEvaluator::MAX) { - builder.CreateCondBr(src_is_null, ret_block, src_not_null_block); - - // Src slot is not null - builder.SetInsertPoint(src_not_null_block); - Function* is_null_fn = slot_desc->codegen_is_null(codegen, tuple_struct); - Value* dst_is_null = builder.CreateCall(is_null_fn, agg_tuple_arg); - builder.CreateCondBr(dst_is_null, dst_null_block, dst_not_null_block); - // dst slot is null - builder.SetInsertPoint(dst_null_block); - Value* dst_ptr = - builder.CreateStructGEP(agg_tuple_arg, slot_desc->field_idx(), "dst_slot_ptr"); - if (slot_desc->is_nullable()) { - // Dst is NULL, just update dst slot to src slot and clear null bit - Function* clear_null_fn = slot_desc->codegen_update_null(codegen, tuple_struct, false); - builder.CreateCall(clear_null_fn, agg_tuple_arg); - } - builder.CreateStore(src.get_val(), dst_ptr); - builder.CreateBr(ret_block); - } else { - builder.CreateCondBr(src_is_null, ret_block, dst_not_null_block); - } - - - // Src slot is not null, update dst_slot - builder.SetInsertPoint(dst_not_null_block); - Value* dst_ptr = - builder.CreateStructGEP(agg_tuple_arg, slot_desc->field_idx(), "dst_slot_ptr"); - Value* result = NULL; - - if (slot_desc->is_nullable()) { - // Dst is NULL, just update dst slot to src slot and clear null bit - Function* clear_null_fn = slot_desc->codegen_update_null(codegen, tuple_struct, false); - builder.CreateCall(clear_null_fn, agg_tuple_arg); - } - - // Update the slot - Value* dst_value = builder.CreateLoad(dst_ptr, 
"dst_val"); - switch (evaluator->agg_op()) { - case AggFnEvaluator::COUNT: - if (evaluator->is_merge()) { - result = builder.CreateAdd(dst_value, src.get_val(), "count_sum"); - } else { - result = builder.CreateAdd( - dst_value, codegen->get_int_constant(TYPE_BIGINT, 1), "count_inc"); - } - break; - case AggFnEvaluator::MIN: { - Function* min_fn = codegen->codegen_min_max(slot_desc->type(), true); - Value* min_args[] = { dst_value, src.get_val() }; - result = builder.CreateCall(min_fn, min_args, "min_value"); - break; - } - case AggFnEvaluator::MAX: { - Function* max_fn = codegen->codegen_min_max(slot_desc->type(), false); - Value* max_args[] = { dst_value, src.get_val() }; - result = builder.CreateCall(max_fn, max_args, "max_value"); - break; - } - case AggFnEvaluator::SUM: - if (slot_desc->type().type == TYPE_FLOAT || slot_desc->type().type == TYPE_DOUBLE) { - result = builder.CreateFAdd(dst_value, src.get_val()); - } else { - result = builder.CreateAdd(dst_value, src.get_val()); - } - break; - case AggFnEvaluator::NDV: { - DCHECK_EQ(slot_desc->type().type, TYPE_VARCHAR); - IRFunction::Type ir_function_type = evaluator->is_merge() ? IRFunction::HLL_MERGE - : get_hll_update_function2(input_expr->type()); - Function* hll_fn = codegen->get_function(ir_function_type); - - // Create pointer to src_anyval to pass to HllUpdate() function. We must use the - // unlowered type. 
- Value* src_lowered_ptr = codegen->create_entry_block_alloca( - fn, LlvmCodeGen::NamedVariable("src_lowered_ptr", src.value()->getType())); - builder.CreateStore(src.value(), src_lowered_ptr); - Type* unlowered_ptr_type = - CodegenAnyVal::get_unlowered_type(codegen, input_expr->type())->getPointerTo(); - Value* src_unlowered_ptr = - builder.CreateBitCast(src_lowered_ptr, unlowered_ptr_type, "src_unlowered_ptr"); - - // Create StringVal* intermediate argument from dst_value - CodegenAnyVal dst_stringval = CodegenAnyVal::get_non_null_val( - codegen, &builder, TypeDescriptor(TYPE_VARCHAR), "dst_stringval"); - dst_stringval.set_from_raw_value(dst_value); - // Create pointer to dst_stringval to pass to HllUpdate() function. We must use - // the unlowered type. - Value* dst_lowered_ptr = codegen->create_entry_block_alloca( - fn, LlvmCodeGen::NamedVariable("dst_lowered_ptr", - dst_stringval.value()->getType())); - builder.CreateStore(dst_stringval.value(), dst_lowered_ptr); - unlowered_ptr_type = - codegen->get_ptr_type(CodegenAnyVal::get_unlowered_type( - codegen, TypeDescriptor(TYPE_VARCHAR))); - Value* dst_unlowered_ptr = - builder.CreateBitCast(dst_lowered_ptr, unlowered_ptr_type, "dst_unlowered_ptr"); - - // Call 'hll_fn' - builder.CreateCall3(hll_fn, fn_ctx_arg, src_unlowered_ptr, dst_unlowered_ptr); - - // Convert StringVal intermediate 'dst_arg' back to StringValue - Value* anyval_result = builder.CreateLoad(dst_lowered_ptr, "anyval_result"); - result = CodegenAnyVal(codegen, &builder, TypeDescriptor(TYPE_VARCHAR), anyval_result) - .to_native_value(); - break; - } - default: - DCHECK(false) << "bad aggregate operator: " << evaluator->agg_op(); - } - - builder.CreateStore(result, dst_ptr); - builder.CreateBr(ret_block); - - builder.SetInsertPoint(ret_block); - builder.CreateRetVoid(); - - fn = codegen->finalize_function(fn); - return fn; -} - -// IR codegen for the update_tuple loop. This loop is query specific and -// based on the aggregate functions. 
The function signature must match the non- -// codegen'd update_tuple exactly. -// For the query: -// select count(*), count(int_col), sum(double_col) the IR looks like: -// -// define void @update_tuple(%"class.palo::AggregationNode"* %this_ptr, -// %"class.palo::Tuple"* %agg_tuple, -// %"class.palo::TupleRow"* %tuple_row) #20 { -// entry: -// %tuple = bitcast %"class.palo::Tuple"* %agg_tuple to { i8, i64, i64, double }* -// %src_slot = getelementptr inbounds { i8, i64, i64, double }* %tuple, i32 0, i32 1 -// %count_star_val = load i64* %src_slot -// %count_star_inc = add i64 %count_star_val, 1 -// store i64 %count_star_inc, i64* %src_slot -// call void @update_slot(%"class.palo_udf::FunctionContext"* inttoptr -// (i64 44521296 to %"class.palo_udf::FunctionContext"*), -// { i8, i64, i64, double }* %tuple, -// %"class.palo::TupleRow"* %tuple_row) -// call void @UpdateSlot5(%"class.palo_udf::FunctionContext"* inttoptr -// (i64 44521328 to %"class.palo_udf::FunctionContext"*), -// { i8, i64, i64, double }* %tuple, -// %"class.palo::TupleRow"* %tuple_row) -// ret void -// } -Function* AggregationNode::codegen_update_tuple(RuntimeState* state) { - LlvmCodeGen* codegen = NULL; - if (!state->get_codegen(&codegen).ok()) { - return NULL; - } - SCOPED_TIMER(codegen->codegen_timer()); - - int j = _probe_expr_ctxs.size(); - for (int i = 0; i < _aggregate_evaluators.size(); ++i, ++j) { - // skip non-materialized slots; we don't have evaluators instantiated for those - while (!_intermediate_tuple_desc->slots()[j]->is_materialized()) { - DCHECK_LT(j, _intermediate_tuple_desc->slots().size() - 1); - ++j; - } - SlotDescriptor* slot_desc = _intermediate_tuple_desc->slots()[j]; - AggFnEvaluator* evaluator = _aggregate_evaluators[i]; - - // Timestamp and char are never supported. NDV supports decimal and string but no - // other functions. 
- // TODO: the other aggregate functions might work with decimal as-is - // TODO(zc) - if (slot_desc->type().type == TYPE_DATETIME || slot_desc->type().type == TYPE_CHAR || - (evaluator->agg_op() != AggFnEvaluator::NDV && - (slot_desc->type().type == TYPE_DECIMAL || - slot_desc->type().type == TYPE_CHAR || - slot_desc->type().type == TYPE_VARCHAR))) { - LOG(INFO) << "Could not codegen UpdateIntermediateTuple because " - << "string, char, timestamp and decimal are not yet supported."; - return NULL; - } - if (evaluator->agg_op() == AggFnEvaluator::COUNT_DISTINCT - || evaluator->agg_op() == AggFnEvaluator::SUM_DISTINCT) { - return NULL; - } - - // Don't codegen things that aren't builtins (for now) - if (!evaluator->is_builtin()) { - return NULL; - } - } - - if (_intermediate_tuple_desc->generate_llvm_struct(codegen) == NULL) { - LOG(INFO) << "Could not codegen update_tuple because we could" - << "not generate a matching llvm struct for the intermediate tuple."; - return NULL; - } - - // Get the types to match the update_tuple signature - Type* agg_node_type = codegen->get_type(AggregationNode::_s_llvm_class_name); - Type* agg_tuple_type = codegen->get_type(Tuple::_s_llvm_class_name); - Type* tuple_row_type = codegen->get_type(TupleRow::_s_llvm_class_name); - - DCHECK(agg_node_type != NULL); - DCHECK(agg_tuple_type != NULL); - DCHECK(tuple_row_type != NULL); - - PointerType* agg_node_ptr_type = PointerType::get(agg_node_type, 0); - PointerType* agg_tuple_ptr_type = PointerType::get(agg_tuple_type, 0); - PointerType* tuple_row_ptr_type = PointerType::get(tuple_row_type, 0); - - // Signature for update_tuple is - // void update_tuple(AggregationNode* this, Tuple* tuple, TupleRow* row) - // This signature needs to match the non-codegen'd signature exactly. 
- StructType* tuple_struct = _intermediate_tuple_desc->generate_llvm_struct(codegen); - PointerType* tuple_ptr = PointerType::get(tuple_struct, 0); - LlvmCodeGen::FnPrototype prototype(codegen, "update_tuple", codegen->void_type()); - prototype.add_argument(LlvmCodeGen::NamedVariable("this_ptr", agg_node_ptr_type)); - prototype.add_argument(LlvmCodeGen::NamedVariable("agg_tuple", agg_tuple_ptr_type)); - prototype.add_argument(LlvmCodeGen::NamedVariable("tuple_row", tuple_row_ptr_type)); - - LlvmCodeGen::LlvmBuilder builder(codegen->context()); - Value* args[3]; - Function* fn = prototype.generate_prototype(&builder, &args[0]); - - // Cast the parameter types to the internal llvm runtime types. - // TODO: get rid of this by using right type in function signature - args[1] = builder.CreateBitCast(args[1], tuple_ptr, "tuple"); - - // Loop over each expr and generate the IR for that slot. If the expr is not - // count(*), generate a helper IR function to update the slot and call that. - j = _probe_expr_ctxs.size(); - for (int i = 0; i < _aggregate_evaluators.size(); ++i, ++j) { - // skip non-materialized slots; we don't have evaluators instantiated for those - while (!_intermediate_tuple_desc->slots()[j]->is_materialized()) { - DCHECK_LT(j, _intermediate_tuple_desc->slots().size() - 1); - ++j; - } - SlotDescriptor* slot_desc = _intermediate_tuple_desc->slots()[j]; - AggFnEvaluator* evaluator = _aggregate_evaluators[i]; - if (evaluator->is_count_star()) { - // TODO: we should be able to hoist this up to the loop over the batch and just - // increment the slot by the number of rows in the batch. 
- int field_idx = slot_desc->field_idx(); - Value* const_one = codegen->get_int_constant(TYPE_BIGINT, 1); - Value* slot_ptr = builder.CreateStructGEP(args[1], field_idx, "src_slot"); - Value* slot_loaded = builder.CreateLoad(slot_ptr, "count_star_val"); - Value* count_inc = builder.CreateAdd(slot_loaded, const_one, "count_star_inc"); - builder.CreateStore(count_inc, slot_ptr); - } else { - Function* update_slot_fn = codegen_update_slot(state, evaluator, slot_desc); - if (update_slot_fn == NULL) { - return NULL; - } - Value* fn_ctx_arg = codegen->cast_ptr_to_llvm_ptr( - codegen->get_ptr_type(FunctionContextImpl::_s_llvm_functioncontext_name), - _agg_fn_ctxs[i]); - builder.CreateCall3(update_slot_fn, fn_ctx_arg, args[1], args[2]); - } - } - builder.CreateRetVoid(); - - // CodegenProcessRowBatch() does the final optimizations. - return codegen->finalize_function(fn); -} - -Function* AggregationNode::codegen_process_row_batch( - RuntimeState* state, Function* update_tuple_fn) { - LlvmCodeGen* codegen = NULL; - if (!state->get_codegen(&codegen).ok()) { - return NULL; - } - SCOPED_TIMER(codegen->codegen_timer()); - DCHECK(update_tuple_fn != NULL); - - // Get the cross compiled update row batch function - IRFunction::Type ir_fn = - (!_probe_expr_ctxs.empty() ? IRFunction::AGG_NODE_PROCESS_ROW_BATCH_WITH_GROUPING - : IRFunction::AGG_NODE_PROCESS_ROW_BATCH_NO_GROUPING); - Function* process_batch_fn = codegen->get_function(ir_fn); - if (process_batch_fn == NULL) { - LOG(ERROR) << "Could not find AggregationNode::ProcessRowBatch in module."; - return NULL; - } - - int replaced = 0; - if (!_probe_expr_ctxs.empty()) { - // Aggregation w/o grouping does not use a hash table. 
- - // Codegen for hash - Function* hash_fn = _hash_tbl->codegen_hash_current_row(state); - if (hash_fn == NULL) { - return NULL; - } - - // Codegen HashTable::Equals - Function* equals_fn = _hash_tbl->codegen_equals(state); - if (equals_fn == NULL) { - return NULL; - } - - // Codegen for evaluating build rows - Function* eval_build_row_fn = _hash_tbl->codegen_eval_tuple_row(state, true); - if (eval_build_row_fn == NULL) { - return NULL; - } - - // Codegen for evaluating probe rows - Function* eval_probe_row_fn = _hash_tbl->codegen_eval_tuple_row(state, false); - if (eval_probe_row_fn == NULL) { - return NULL; - } - - // Replace call sites - process_batch_fn = codegen->replace_call_sites( - process_batch_fn, false, eval_build_row_fn, "eval_build_row", &replaced); - DCHECK_EQ(replaced, 1); - - process_batch_fn = codegen->replace_call_sites( - process_batch_fn, false, eval_probe_row_fn, "eval_probe_row", &replaced); - DCHECK_EQ(replaced, 1); - - process_batch_fn = codegen->replace_call_sites( - process_batch_fn, false, hash_fn, "hash_current_row", &replaced); - DCHECK_EQ(replaced, 2); - - process_batch_fn = codegen->replace_call_sites( - process_batch_fn, false, equals_fn, "equals", &replaced); - DCHECK_EQ(replaced, 1); - } - - process_batch_fn = codegen->replace_call_sites( - process_batch_fn, false, update_tuple_fn, "update_tuple", &replaced); - DCHECK_EQ(replaced, 1) << "One call site should be replaced."; - DCHECK(process_batch_fn != NULL); - return codegen->optimize_function_with_exprs(process_batch_fn); -} -} - +#include "exec/aggregation_node.h" + +#include +#include +#include +#include +#include +#include + +#include "codegen/codegen_anyval.h" +#include "codegen/llvm_codegen.h" +#include "exec/hash_table.hpp" +#include "exprs/agg_fn_evaluator.h" +#include "exprs/expr.h" +#include "exprs/slot_ref.h" +#include "gen_cpp/Exprs_types.h" +#include "gen_cpp/PlanNodes_types.h" +#include "runtime/descriptors.h" +#include "runtime/mem_pool.h" +#include 
"runtime/raw_value.h" +#include "runtime/row_batch.h" +#include "runtime/runtime_state.h" +#include "runtime/string_value.hpp" +#include "runtime/tuple.h" +#include "runtime/tuple_row.h" +#include "util/debug_util.h" +#include "util/runtime_profile.h" + +using llvm::BasicBlock; +using llvm::Function; +using llvm::PointerType; +using llvm::Type; +using llvm::Value; +using llvm::StructType; + +namespace palo { + +const char* AggregationNode::_s_llvm_class_name = "class.palo::AggregationNode"; + +// TODO: pass in maximum size; enforce by setting limit in mempool +// TODO: have a Status ExecNode::init(const TPlanNode&) member function +// that does initialization outside of c'tor, so we can indicate errors +AggregationNode::AggregationNode( + ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs) : + ExecNode(pool, tnode, descs), + _intermediate_tuple_id(tnode.agg_node.intermediate_tuple_id), + _intermediate_tuple_desc(NULL), + _output_tuple_id(tnode.agg_node.output_tuple_id), + _output_tuple_desc(NULL), + _singleton_output_tuple(NULL), + //_tuple_pool(new MemPool()), + // + _codegen_process_row_batch_fn(NULL), + _process_row_batch_fn(NULL), + _needs_finalize(tnode.agg_node.need_finalize), + _build_timer(NULL), + _get_results_timer(NULL), + _hash_table_buckets_counter(NULL) { +} + +AggregationNode::~AggregationNode() { +} + +Status AggregationNode::init(const TPlanNode& tnode, RuntimeState* state) { + RETURN_IF_ERROR(ExecNode::init(tnode, state)); + // ignore return status for now , so we need to introduct ExecNode::init() + RETURN_IF_ERROR(Expr::create_expr_trees( + _pool, tnode.agg_node.grouping_exprs, &_probe_expr_ctxs)); + + for (int i = 0; i < tnode.agg_node.aggregate_functions.size(); ++i) { + AggFnEvaluator* evaluator = NULL; + AggFnEvaluator::create( + _pool, tnode.agg_node.aggregate_functions[i], &evaluator); + _aggregate_evaluators.push_back(evaluator); + } + return Status::OK; +} + +Status AggregationNode::prepare(RuntimeState* state) { + 
RETURN_IF_ERROR(ExecNode::prepare(state)); + _build_timer = ADD_TIMER(runtime_profile(), "BuildTime"); + _get_results_timer = ADD_TIMER(runtime_profile(), "GetResultsTime"); + _hash_table_buckets_counter = + ADD_COUNTER(runtime_profile(), "BuildBuckets", TUnit::UNIT); + _hash_table_load_factor_counter = + ADD_COUNTER(runtime_profile(), "LoadFactor", TUnit::DOUBLE_VALUE); + + SCOPED_TIMER(_runtime_profile->total_time_counter()); + + _intermediate_tuple_desc = + state->desc_tbl().get_tuple_descriptor(_intermediate_tuple_id); + _output_tuple_desc = state->desc_tbl().get_tuple_descriptor(_output_tuple_id); + DCHECK_EQ(_intermediate_tuple_desc->slots().size(), _output_tuple_desc->slots().size()); + RETURN_IF_ERROR(Expr::prepare( + _probe_expr_ctxs, state, child(0)->row_desc(), expr_mem_tracker())); + + // Construct build exprs from _agg_tuple_desc + for (int i = 0; i < _probe_expr_ctxs.size(); ++i) { + SlotDescriptor* desc = _intermediate_tuple_desc->slots()[i]; + Expr* expr = new SlotRef(desc); + state->obj_pool()->add(expr); + _build_expr_ctxs.push_back(new ExprContext(expr)); + state->obj_pool()->add(_build_expr_ctxs.back()); + } + + // Construct a new row desc for preparing the build exprs because neither the child's + // nor this node's output row desc may contain the intermediate tuple, e.g., + // in a single-node plan with an intermediate tuple different from the output tuple. 
+ RowDescriptor build_row_desc(_intermediate_tuple_desc, false); + RETURN_IF_ERROR(Expr::prepare( + _build_expr_ctxs, state, build_row_desc, expr_mem_tracker())); + + _tuple_pool.reset(new MemPool(mem_tracker())); + + _agg_fn_ctxs.resize(_aggregate_evaluators.size()); + int j = _probe_expr_ctxs.size(); + for (int i = 0; i < _aggregate_evaluators.size(); ++i, ++j) { + // skip non-materialized slots; we don't have evaluators instantiated for those + // while (!_agg_tuple_desc->slots()[j]->is_materialized()) { + // DCHECK_LT(j, _agg_tuple_desc->slots().size() - 1) + // << "#eval= " << _aggregate_evaluators.size() + // << " #probe=" << _probe_expr_ctxs.size(); + // ++j; + // } + SlotDescriptor* intermediate_slot_desc = _intermediate_tuple_desc->slots()[j]; + SlotDescriptor* output_slot_desc = _output_tuple_desc->slots()[j]; + RETURN_IF_ERROR(_aggregate_evaluators[i]->prepare( + state, child(0)->row_desc(), _tuple_pool.get(), + intermediate_slot_desc, output_slot_desc, mem_tracker(), &_agg_fn_ctxs[i])); + state->obj_pool()->add(_agg_fn_ctxs[i]); + } + + // TODO: how many buckets? + _hash_tbl.reset(new HashTable( + _build_expr_ctxs, _probe_expr_ctxs, 1, true, id(), mem_tracker(), 1024)); + + if (_probe_expr_ctxs.empty()) { + // create single output tuple now; we need to output something + // even if our input is empty + _singleton_output_tuple = construct_intermediate_tuple(); + } + + if (state->codegen_level() > 0) { + LlvmCodeGen* codegen = NULL; + RETURN_IF_ERROR(state->get_codegen(&codegen)); + Function* update_tuple_fn = codegen_update_tuple(state); + if (update_tuple_fn != NULL) { + _codegen_process_row_batch_fn = + codegen_process_row_batch(state, update_tuple_fn); + if (_codegen_process_row_batch_fn != NULL) { + // Update to using codegen'd process row batch. 
+ codegen->add_function_to_jit(_codegen_process_row_batch_fn, + reinterpret_cast(&_process_row_batch_fn)); + // AddRuntimeExecOption("Codegen Enabled"); + } + } + } + + return Status::OK; +} + +Status AggregationNode::open(RuntimeState* state) { + RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); + SCOPED_TIMER(_runtime_profile->total_time_counter()); + RETURN_IF_ERROR(ExecNode::open(state)); + RETURN_IF_ERROR(Expr::open(_probe_expr_ctxs, state)); + RETURN_IF_ERROR(Expr::open(_build_expr_ctxs, state)); + + for (int i = 0; i < _aggregate_evaluators.size(); ++i) { + RETURN_IF_ERROR(_aggregate_evaluators[i]->open(state, _agg_fn_ctxs[i])); + } + + RETURN_IF_ERROR(_children[0]->open(state)); + + RowBatch batch(_children[0]->row_desc(), state->batch_size(), mem_tracker()); + int64_t num_input_rows = 0; + int64_t num_agg_rows = 0; + + bool early_return = false; + bool limit_with_no_agg = (limit() != -1 && (_aggregate_evaluators.size() == 0)); + DCHECK_EQ(_aggregate_evaluators.size(), _agg_fn_ctxs.size()); + + while (true) { + bool eos = false; + RETURN_IF_CANCELLED(state); + RETURN_IF_ERROR(state->check_query_state()); + RETURN_IF_ERROR(_children[0]->get_next(state, &batch, &eos)); + // SCOPED_TIMER(_build_timer); + if (VLOG_ROW_IS_ON) { + for (int i = 0; i < batch.num_rows(); ++i) { + TupleRow* row = batch.get_row(i); + VLOG_ROW << "id=" << id() << " input row: " + << print_row(row, _children[0]->row_desc()); + } + } + + int64_t agg_rows_before = _hash_tbl->size(); + + if (_process_row_batch_fn != NULL) { + _process_row_batch_fn(this, &batch); + } else if (_singleton_output_tuple != NULL) { + SCOPED_TIMER(_build_timer); + process_row_batch_no_grouping(&batch, _tuple_pool.get()); + } else { + process_row_batch_with_grouping(&batch, _tuple_pool.get()); + if (limit_with_no_agg) { + if (_hash_tbl->size() >= limit()) { + early_return = true; + } + } + } + + // RETURN_IF_LIMIT_EXCEEDED(state); + RETURN_IF_ERROR(state->check_query_state()); + + 
COUNTER_SET(_hash_table_buckets_counter, _hash_tbl->num_buckets()); + COUNTER_SET(memory_used_counter(), + _tuple_pool->peak_allocated_bytes() + _hash_tbl->byte_size()); + COUNTER_SET(_hash_table_load_factor_counter, _hash_tbl->load_factor()); + num_agg_rows += (_hash_tbl->size() - agg_rows_before); + num_input_rows += batch.num_rows(); + + batch.reset(); + + RETURN_IF_ERROR(state->check_query_state()); + if (eos) { + break; + } + if (early_return) { + break; + } + } + + if (_singleton_output_tuple != NULL) { + _hash_tbl->insert(reinterpret_cast(&_singleton_output_tuple)); + ++num_agg_rows; + } + + VLOG_ROW << "id=" << id() << " aggregated " << num_input_rows << " input rows into " + << num_agg_rows << " output rows"; + _output_iterator = _hash_tbl->begin(); + return Status::OK; +} + +Status AggregationNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + SCOPED_TIMER(_runtime_profile->total_time_counter()); + RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); + RETURN_IF_CANCELLED(state); + RETURN_IF_ERROR(state->check_query_state()); + SCOPED_TIMER(_get_results_timer); + + if (reached_limit()) { + *eos = true; + return Status::OK; + } + + ExprContext** ctxs = &_conjunct_ctxs[0]; + int num_ctxs = _conjunct_ctxs.size(); + + int count = 0; + const int N = state->batch_size(); + while (!_output_iterator.at_end() && !row_batch->at_capacity()) { + // This loop can go on for a long time if the conjuncts are very selective. Do query + // maintenance every N iterations. 
+ if (count++ % N == 0) { + RETURN_IF_CANCELLED(state); + RETURN_IF_ERROR(state->check_query_state()); + } + int row_idx = row_batch->add_row(); + TupleRow* row = row_batch->get_row(row_idx); + Tuple* intermediate_tuple = _output_iterator.get_row()->get_tuple(0); + Tuple* output_tuple = + finalize_tuple(intermediate_tuple, row_batch->tuple_data_pool()); + row->set_tuple(0, output_tuple); + + if (ExecNode::eval_conjuncts(ctxs, num_ctxs, row)) { + VLOG_ROW << "output row: " << print_row(row, row_desc()); + row_batch->commit_last_row(); + ++_num_rows_returned; + + if (reached_limit()) { + break; + } + } + + _output_iterator.next(); + } + + *eos = _output_iterator.at_end() || reached_limit(); + if (*eos) { + if (memory_used_counter() != NULL && _hash_tbl.get() != NULL && + _hash_table_buckets_counter != NULL) { + COUNTER_SET(memory_used_counter(), + _tuple_pool->peak_allocated_bytes() + _hash_tbl->byte_size()); + COUNTER_SET(_hash_table_buckets_counter, _hash_tbl->num_buckets()); + } + } + COUNTER_SET(_rows_returned_counter, _num_rows_returned); + return Status::OK; +} + +Status AggregationNode::close(RuntimeState* state) { + if (is_closed()) { + return Status::OK; + } + + // Iterate through the remaining rows in the hash table and call Serialize/Finalize on + // them in order to free any memory allocated by UDAs. Finalize() requires a dst tuple + // but we don't actually need the result, so allocate a single dummy tuple to avoid + // accumulating memory. 
+ Tuple* dummy_dst = NULL; + if (_needs_finalize && _output_tuple_desc != NULL) { + dummy_dst = Tuple::create(_output_tuple_desc->byte_size(), _tuple_pool.get()); + } + while (!_output_iterator.at_end()) { + Tuple* tuple = _output_iterator.get_row()->get_tuple(0); + if (_needs_finalize) { + AggFnEvaluator::finalize(_aggregate_evaluators, _agg_fn_ctxs, tuple, dummy_dst); + } else { + AggFnEvaluator::serialize(_aggregate_evaluators, _agg_fn_ctxs, tuple); + } + _output_iterator.next(); + } + + for (int i = 0; i < _aggregate_evaluators.size(); ++i) { + _aggregate_evaluators[i]->close(state); + if (!_agg_fn_ctxs.empty() && _agg_fn_ctxs[i] && _agg_fn_ctxs[i]->impl()) { + _agg_fn_ctxs[i]->impl()->close(); + } + } + + if (_tuple_pool.get() != NULL) { + _tuple_pool->free_all(); + } + if (_hash_tbl.get() != NULL) { + _hash_tbl->close(); + } + + Expr::close(_probe_expr_ctxs, state); + Expr::close(_build_expr_ctxs, state); + + return ExecNode::close(state); +} + +Tuple* AggregationNode::construct_intermediate_tuple() { + Tuple* agg_tuple = Tuple::create(_intermediate_tuple_desc->byte_size(), _tuple_pool.get()); + vector::const_iterator slot_desc = _intermediate_tuple_desc->slots().begin(); + + // copy grouping values + for (int i = 0; i < _probe_expr_ctxs.size(); ++i, ++slot_desc) { + if (_hash_tbl->last_expr_value_null(i)) { + agg_tuple->set_null((*slot_desc)->null_indicator_offset()); + } else { + void* src = _hash_tbl->last_expr_value(i); + void* dst = agg_tuple->get_slot((*slot_desc)->tuple_offset()); + RawValue::write(src, dst, (*slot_desc)->type(), _tuple_pool.get()); + } + } + + // Initialize aggregate output. + for (int i = 0; i < _aggregate_evaluators.size(); ++i, ++slot_desc) { + while (!(*slot_desc)->is_materialized()) { + ++slot_desc; + } + + AggFnEvaluator* evaluator = _aggregate_evaluators[i]; + evaluator->init(_agg_fn_ctxs[i], agg_tuple); + + // Codegen specific path. 
+ // To minimize branching on the UpdateAggTuple path, initialize the result value + // so that UpdateAggTuple doesn't have to check if the aggregation + // dst slot is null. + // - sum/count: 0 + // - min: max_value + // - max: min_value + // TODO: remove when we don't use the irbuilder for codegen here. + // This optimization no longer applies with AnyVal + if (!(*slot_desc)->type().is_string_type() && + !(*slot_desc)->type().is_date_type()) { + ExprValue default_value; + void* default_value_ptr = NULL; + + switch (evaluator->agg_op()) { + case TAggregationOp::MIN: + default_value_ptr = default_value.set_to_max((*slot_desc)->type()); + RawValue::write(default_value_ptr, agg_tuple, *slot_desc, NULL); + break; + + case TAggregationOp::MAX: + default_value_ptr = default_value.set_to_min((*slot_desc)->type()); + RawValue::write(default_value_ptr, agg_tuple, *slot_desc, NULL); + break; + + default: + break; + } + } + } + + return agg_tuple; +} + +void AggregationNode::update_tuple(Tuple* tuple, TupleRow* row) { + DCHECK(tuple != NULL); + + AggFnEvaluator::add(_aggregate_evaluators, _agg_fn_ctxs, row, tuple); +#if 0 + vector::const_iterator evaluator; + int i = 0; + for (evaluator = _aggregate_evaluators.begin(); + evaluator != _aggregate_evaluators.end(); ++evaluator, ++i) { + (*evaluator)->choose_update_or_merge(_agg_fn_ctxs[i], row, tuple); + //if (_is_merge) { + // (*evaluator)->merge(_agg_fn_ctxs[i], row, tuple, pool); + //} else { + // (*evaluator)->update(_agg_fn_ctxs[i], row, tuple, pool); + //} + } +#endif +} + +Tuple* AggregationNode::finalize_tuple(Tuple* tuple, MemPool* pool) { + DCHECK(tuple != NULL); + + Tuple* dst = tuple; + if (_needs_finalize && _intermediate_tuple_id != _output_tuple_id) { + dst = Tuple::create(_output_tuple_desc->byte_size(), pool); + } + if (_needs_finalize) { + AggFnEvaluator::finalize(_aggregate_evaluators, _agg_fn_ctxs, tuple, dst); + } else { + AggFnEvaluator::serialize(_aggregate_evaluators, _agg_fn_ctxs, tuple); + } + // Copy 
grouping values from tuple to dst. + // TODO: Codegen this. + if (dst != tuple) { + int num_grouping_slots = _probe_expr_ctxs.size(); + for (int i = 0; i < num_grouping_slots; ++i) { + SlotDescriptor* src_slot_desc = _intermediate_tuple_desc->slots()[i]; + SlotDescriptor* dst_slot_desc = _output_tuple_desc->slots()[i]; + bool src_slot_null = tuple->is_null(src_slot_desc->null_indicator_offset()); + void* src_slot = NULL; + if (!src_slot_null) src_slot = tuple->get_slot(src_slot_desc->tuple_offset()); + RawValue::write(src_slot, dst, dst_slot_desc, NULL); + } + } + return dst; +} + +void AggregationNode::debug_string(int indentation_level, std::stringstream* out) const { + *out << std::string(indentation_level * 2, ' '); + *out << "AggregationNode(intermediate_tuple_id=" << _intermediate_tuple_id + << " output_tuple_id=" << _output_tuple_id + << " needs_finalize=" << _needs_finalize + // << " probe_exprs=" << Expr::debug_string(_probe_exprs) + << " agg_exprs=" << AggFnEvaluator::debug_string(_aggregate_evaluators); + ExecNode::debug_string(indentation_level, out); + *out << ")"; +} + +void AggregationNode::push_down_predicate(RuntimeState *state, + std::list *expr_ctxs) { + // groupby can pushdown, agg can't pushdown + // Now we doesn't pushdown for easy. 
+ return; +} + +static IRFunction::Type get_hll_update_function2(const TypeDescriptor& type) { + switch (type.type) { + case TYPE_BOOLEAN: + return IRFunction::HLL_UPDATE_BOOLEAN; + case TYPE_TINYINT: + return IRFunction::HLL_UPDATE_TINYINT; + case TYPE_SMALLINT: + return IRFunction::HLL_UPDATE_SMALLINT; + case TYPE_INT: + return IRFunction::HLL_UPDATE_INT; + case TYPE_BIGINT: + return IRFunction::HLL_UPDATE_BIGINT; + case TYPE_FLOAT: + return IRFunction::HLL_UPDATE_FLOAT; + case TYPE_DOUBLE: + return IRFunction::HLL_UPDATE_DOUBLE; + case TYPE_CHAR: + case TYPE_VARCHAR: + return IRFunction::HLL_UPDATE_STRING; + case TYPE_DECIMAL: + return IRFunction::HLL_UPDATE_DECIMAL; + default: + DCHECK(false) << "Unsupported type: " << type; + return IRFunction::FN_END; + } +} + +// IR Generation for updating a single aggregation slot. Signature is: +// void update_slot(FunctionContext* fn_ctx, AggTuple* agg_tuple, char** row) +// +// The IR for sum(double_col) is: +// define void @update_slot(%"class.palo_udf::FunctionContext"* %fn_ctx, +// { i8, double }* %agg_tuple, +// %"class.palo::TupleRow"* %row) #20 { +// entry: +// %src = call { i8, double } @GetSlotRef(%"class.palo::ExprContext"* inttoptr +// (i64 128241264 to %"class.palo::ExprContext"*), %"class.palo::TupleRow"* %row) +// %0 = extractvalue { i8, double } %src, 0 +// %is_null = trunc i8 %0 to i1 +// br i1 %is_null, label %ret, label %src_not_null +// +// src_not_null: ; preds = %entry +// %dst_slot_ptr = getelementptr inbounds { i8, double }* %agg_tuple, i32 0, i32 1 +// call void @SetNotNull({ i8, double }* %agg_tuple) +// %dst_val = load double* %dst_slot_ptr +// %val = extractvalue { i8, double } %src, 1 +// %1 = fadd double %dst_val, %val +// store double %1, double* %dst_slot_ptr +// br label %ret +// +// ret: ; preds = %src_not_null, %entry +// ret void +// } +// +// The IR for min(double_col) is: +// define void @update_slot(%"class.palo_udf::FunctionContext"* %fn_ctx, +// { i8, double }* %agg_tuple, +// 
%"class.palo::TupleRow"* %row) #20 { +// entry: +// %src = call { i8, double } @GetSlotRef(%"class.palo::ExprContext"* inttoptr +// (i64 128241264 to %"class.palo::ExprContext"*), %"class.palo::TupleRow"* %row) +// %0 = extractvalue { i8, double } %src, 0 +// %is_null = trunc i8 %0 to i1 +// br i1 %is_null, label %ret, label %src_not_null +// +// src_not_null: ; preds = %entry +// %dst_is_null = call i8 @is_null(tuple); +// br i1 %dst_is_null, label dst_null, label dst_not_null +// +// dst_null: ; preds = %entry +// %dst_slot_ptr = getelementptr inbounds { i8, double }* %agg_tuple, i32 0, i32 1 +// call void @SetNotNull({ i8, double }* %agg_tuple) +// %val = extractvalue { i8, double } %src, 1 +// store double %val, double* %dst_slot_ptr +// br label %ret +// +// dst_not_null: ; preds = %src_not_null +// %dst_slot_ptr = getelementptr inbounds { i8, double }* %agg_tuple, i32 0, i32 1 +// call void @SetNotNull({ i8, double }* %agg_tuple) +// %dst_val = load double* %dst_slot_ptr +// %val = extractvalue { i8, double } %src, 1 +// %1 = fadd double %dst_val, %val +// store double %1, double* %dst_slot_ptr +// br label %ret +// +// ret: ; preds = %src_not_null, %entry +// ret void +// } +// The IR for ndv(double_col) is: +// define void @update_slot(%"class.palo_udf::FunctionContext"* %fn_ctx, +// { i8, %"struct.palo::StringValue" }* %agg_tuple, +// %"class.palo::TupleRow"* %row) #20 { +// entry: +// %dst_lowered_ptr = alloca { i64, i8* } +// %src_lowered_ptr = alloca { i8, double } +// %src = call { i8, double } @GetSlotRef(%"class.palo::ExprContext"* inttoptr +// (i64 120530832 to %"class.palo::ExprContext"*), %"class.palo::TupleRow"* %row) +// %0 = extractvalue { i8, double } %src, 0 +// %is_null = trunc i8 %0 to i1 +// br i1 %is_null, label %ret, label %src_not_null +// +// src_not_null: ; preds = %entry +// %dst_slot_ptr = getelementptr inbounds +// { i8, %"struct.palo::StringValue" }* %agg_tuple, i32 0, i32 1 +// call void @SetNotNull({ i8, 
%"struct.palo::StringValue" }* %agg_tuple) +// %dst_val = load %"struct.palo::StringValue"* %dst_slot_ptr +// store { i8, double } %src, { i8, double }* %src_lowered_ptr +// %src_unlowered_ptr = bitcast { i8, double }* %src_lowered_ptr +// to %"struct.palo_udf::DoubleVal"* +// %ptr = extractvalue %"struct.palo::StringValue" %dst_val, 0 +// %dst_stringval = insertvalue { i64, i8* } zeroinitializer, i8* %ptr, 1 +// %len = extractvalue %"struct.palo::StringValue" %dst_val, 1 +// %1 = extractvalue { i64, i8* } %dst_stringval, 0 +// %2 = zext i32 %len to i64 +// %3 = shl i64 %2, 32 +// %4 = and i64 %1, 4294967295 +// %5 = or i64 %4, %3 +// %dst_stringval1 = insertvalue { i64, i8* } %dst_stringval, i64 %5, 0 +// store { i64, i8* } %dst_stringval1, { i64, i8* }* %dst_lowered_ptr +// %dst_unlowered_ptr = bitcast { i64, i8* }* %dst_lowered_ptr +// to %"struct.palo_udf::StringVal"* +// call void @HllUpdate(%"class.palo_udf::FunctionContext"* %fn_ctx, +// %"struct.palo_udf::DoubleVal"* %src_unlowered_ptr, +// %"struct.palo_udf::StringVal"* %dst_unlowered_ptr) +// %anyval_result = load { i64, i8* }* %dst_lowered_ptr +// %6 = extractvalue { i64, i8* } %anyval_result, 1 +// %7 = insertvalue %"struct.palo::StringValue" zeroinitializer, i8* %6, 0 +// %8 = extractvalue { i64, i8* } %anyval_result, 0 +// %9 = ashr i64 %8, 32 +// %10 = trunc i64 %9 to i32 +// %11 = insertvalue %"struct.palo::StringValue" %7, i32 %10, 1 +// store %"struct.palo::StringValue" %11, %"struct.palo::StringValue"* %dst_slot_ptr +// br label %ret +// +// ret: ; preds = %src_not_null, %entry +// ret void +// } +llvm::Function* AggregationNode::codegen_update_slot( + RuntimeState* state, AggFnEvaluator* evaluator, SlotDescriptor* slot_desc) { + DCHECK(slot_desc->is_materialized()); + LlvmCodeGen* codegen = NULL; + if (!state->get_codegen(&codegen).ok()) { + return NULL; + } + + DCHECK_EQ(evaluator->input_expr_ctxs().size(), 1); + ExprContext* input_expr_ctx = evaluator->input_expr_ctxs()[0]; + Expr* input_expr 
= input_expr_ctx->root(); + // TODO: implement timestamp + if (input_expr->type().type == TYPE_DATETIME + || input_expr->type().type == TYPE_DATE + || input_expr->type().type == TYPE_DECIMAL + || input_expr->type().is_string_type()) { + return NULL; + } + Function* agg_expr_fn = NULL; + Status status = input_expr->get_codegend_compute_fn(state, &agg_expr_fn); + if (!status.ok()) { + LOG(INFO) << "Could not codegen update_slot(): " << status.get_error_msg(); + return NULL; + } + DCHECK(agg_expr_fn != NULL); + + PointerType* fn_ctx_type = + codegen->get_ptr_type(FunctionContextImpl::_s_llvm_functioncontext_name); + StructType* tuple_struct = _intermediate_tuple_desc->generate_llvm_struct(codegen); + PointerType* tuple_ptr_type = PointerType::get(tuple_struct, 0); + PointerType* tuple_row_ptr_type = codegen->get_ptr_type(TupleRow::_s_llvm_class_name); + + // Create update_slot prototype + LlvmCodeGen::FnPrototype prototype(codegen, "update_slot", codegen->void_type()); + prototype.add_argument(LlvmCodeGen::NamedVariable("fn_ctx", fn_ctx_type)); + prototype.add_argument(LlvmCodeGen::NamedVariable("agg_tuple", tuple_ptr_type)); + prototype.add_argument(LlvmCodeGen::NamedVariable("row", tuple_row_ptr_type)); + + LlvmCodeGen::LlvmBuilder builder(codegen->context()); + Value* args[3]; + Function* fn = prototype.generate_prototype(&builder, &args[0]); + Value* fn_ctx_arg = args[0]; + Value* agg_tuple_arg = args[1]; + Value* row_arg = args[2]; + + BasicBlock* src_not_null_block = NULL; + BasicBlock* dst_null_block = NULL; + BasicBlock* dst_not_null_block = NULL; + if (evaluator->agg_op() == AggFnEvaluator::MIN + || evaluator->agg_op() == AggFnEvaluator::MAX) { + src_not_null_block = BasicBlock::Create(codegen->context(), "src_not_null", fn); + dst_null_block = BasicBlock::Create(codegen->context(), "dst_null", fn); + } + dst_not_null_block = BasicBlock::Create(codegen->context(), "dst_not_null", fn); + BasicBlock* ret_block = BasicBlock::Create(codegen->context(), "ret", 
fn); + + // Call expr function to get src slot value + Value* ctx_arg = codegen->cast_ptr_to_llvm_ptr( + codegen->get_ptr_type(ExprContext::_s_llvm_class_name), input_expr_ctx); + Value* agg_expr_fn_args[] = { ctx_arg, row_arg }; + CodegenAnyVal src = CodegenAnyVal::create_call_wrapped( + codegen, &builder, input_expr->type(), agg_expr_fn, agg_expr_fn_args, "src", NULL); + + Value* src_is_null = src.get_is_null(); + if (evaluator->agg_op() == AggFnEvaluator::MIN + || evaluator->agg_op() == AggFnEvaluator::MAX) { + builder.CreateCondBr(src_is_null, ret_block, src_not_null_block); + + // Src slot is not null + builder.SetInsertPoint(src_not_null_block); + Function* is_null_fn = slot_desc->codegen_is_null(codegen, tuple_struct); + Value* dst_is_null = builder.CreateCall(is_null_fn, agg_tuple_arg); + builder.CreateCondBr(dst_is_null, dst_null_block, dst_not_null_block); + // dst slot is null + builder.SetInsertPoint(dst_null_block); + Value* dst_ptr = + builder.CreateStructGEP(agg_tuple_arg, slot_desc->field_idx(), "dst_slot_ptr"); + if (slot_desc->is_nullable()) { + // Dst is NULL, just update dst slot to src slot and clear null bit + Function* clear_null_fn = slot_desc->codegen_update_null(codegen, tuple_struct, false); + builder.CreateCall(clear_null_fn, agg_tuple_arg); + } + builder.CreateStore(src.get_val(), dst_ptr); + builder.CreateBr(ret_block); + } else { + builder.CreateCondBr(src_is_null, ret_block, dst_not_null_block); + } + + + // Src slot is not null, update dst_slot + builder.SetInsertPoint(dst_not_null_block); + Value* dst_ptr = + builder.CreateStructGEP(agg_tuple_arg, slot_desc->field_idx(), "dst_slot_ptr"); + Value* result = NULL; + + if (slot_desc->is_nullable()) { + // Dst is NULL, just update dst slot to src slot and clear null bit + Function* clear_null_fn = slot_desc->codegen_update_null(codegen, tuple_struct, false); + builder.CreateCall(clear_null_fn, agg_tuple_arg); + } + + // Update the slot + Value* dst_value = builder.CreateLoad(dst_ptr, 
"dst_val"); + switch (evaluator->agg_op()) { + case AggFnEvaluator::COUNT: + if (evaluator->is_merge()) { + result = builder.CreateAdd(dst_value, src.get_val(), "count_sum"); + } else { + result = builder.CreateAdd( + dst_value, codegen->get_int_constant(TYPE_BIGINT, 1), "count_inc"); + } + break; + case AggFnEvaluator::MIN: { + Function* min_fn = codegen->codegen_min_max(slot_desc->type(), true); + Value* min_args[] = { dst_value, src.get_val() }; + result = builder.CreateCall(min_fn, min_args, "min_value"); + break; + } + case AggFnEvaluator::MAX: { + Function* max_fn = codegen->codegen_min_max(slot_desc->type(), false); + Value* max_args[] = { dst_value, src.get_val() }; + result = builder.CreateCall(max_fn, max_args, "max_value"); + break; + } + case AggFnEvaluator::SUM: + if (slot_desc->type().type == TYPE_FLOAT || slot_desc->type().type == TYPE_DOUBLE) { + result = builder.CreateFAdd(dst_value, src.get_val()); + } else { + result = builder.CreateAdd(dst_value, src.get_val()); + } + break; + case AggFnEvaluator::NDV: { + DCHECK_EQ(slot_desc->type().type, TYPE_VARCHAR); + IRFunction::Type ir_function_type = evaluator->is_merge() ? IRFunction::HLL_MERGE + : get_hll_update_function2(input_expr->type()); + Function* hll_fn = codegen->get_function(ir_function_type); + + // Create pointer to src_anyval to pass to HllUpdate() function. We must use the + // unlowered type. 
+ Value* src_lowered_ptr = codegen->create_entry_block_alloca( + fn, LlvmCodeGen::NamedVariable("src_lowered_ptr", src.value()->getType())); + builder.CreateStore(src.value(), src_lowered_ptr); + Type* unlowered_ptr_type = + CodegenAnyVal::get_unlowered_type(codegen, input_expr->type())->getPointerTo(); + Value* src_unlowered_ptr = + builder.CreateBitCast(src_lowered_ptr, unlowered_ptr_type, "src_unlowered_ptr"); + + // Create StringVal* intermediate argument from dst_value + CodegenAnyVal dst_stringval = CodegenAnyVal::get_non_null_val( + codegen, &builder, TypeDescriptor(TYPE_VARCHAR), "dst_stringval"); + dst_stringval.set_from_raw_value(dst_value); + // Create pointer to dst_stringval to pass to HllUpdate() function. We must use + // the unlowered type. + Value* dst_lowered_ptr = codegen->create_entry_block_alloca( + fn, LlvmCodeGen::NamedVariable("dst_lowered_ptr", + dst_stringval.value()->getType())); + builder.CreateStore(dst_stringval.value(), dst_lowered_ptr); + unlowered_ptr_type = + codegen->get_ptr_type(CodegenAnyVal::get_unlowered_type( + codegen, TypeDescriptor(TYPE_VARCHAR))); + Value* dst_unlowered_ptr = + builder.CreateBitCast(dst_lowered_ptr, unlowered_ptr_type, "dst_unlowered_ptr"); + + // Call 'hll_fn' + builder.CreateCall3(hll_fn, fn_ctx_arg, src_unlowered_ptr, dst_unlowered_ptr); + + // Convert StringVal intermediate 'dst_arg' back to StringValue + Value* anyval_result = builder.CreateLoad(dst_lowered_ptr, "anyval_result"); + result = CodegenAnyVal(codegen, &builder, TypeDescriptor(TYPE_VARCHAR), anyval_result) + .to_native_value(); + break; + } + default: + DCHECK(false) << "bad aggregate operator: " << evaluator->agg_op(); + } + + builder.CreateStore(result, dst_ptr); + builder.CreateBr(ret_block); + + builder.SetInsertPoint(ret_block); + builder.CreateRetVoid(); + + fn = codegen->finalize_function(fn); + return fn; +} + +// IR codegen for the update_tuple loop. This loop is query specific and +// based on the aggregate functions. 
The function signature must match the non- +// codegen'd update_tuple exactly. +// For the query: +// select count(*), count(int_col), sum(double_col) the IR looks like: +// +// define void @update_tuple(%"class.palo::AggregationNode"* %this_ptr, +// %"class.palo::Tuple"* %agg_tuple, +// %"class.palo::TupleRow"* %tuple_row) #20 { +// entry: +// %tuple = bitcast %"class.palo::Tuple"* %agg_tuple to { i8, i64, i64, double }* +// %src_slot = getelementptr inbounds { i8, i64, i64, double }* %tuple, i32 0, i32 1 +// %count_star_val = load i64* %src_slot +// %count_star_inc = add i64 %count_star_val, 1 +// store i64 %count_star_inc, i64* %src_slot +// call void @update_slot(%"class.palo_udf::FunctionContext"* inttoptr +// (i64 44521296 to %"class.palo_udf::FunctionContext"*), +// { i8, i64, i64, double }* %tuple, +// %"class.palo::TupleRow"* %tuple_row) +// call void @UpdateSlot5(%"class.palo_udf::FunctionContext"* inttoptr +// (i64 44521328 to %"class.palo_udf::FunctionContext"*), +// { i8, i64, i64, double }* %tuple, +// %"class.palo::TupleRow"* %tuple_row) +// ret void +// } +Function* AggregationNode::codegen_update_tuple(RuntimeState* state) { + LlvmCodeGen* codegen = NULL; + if (!state->get_codegen(&codegen).ok()) { + return NULL; + } + SCOPED_TIMER(codegen->codegen_timer()); + + int j = _probe_expr_ctxs.size(); + for (int i = 0; i < _aggregate_evaluators.size(); ++i, ++j) { + // skip non-materialized slots; we don't have evaluators instantiated for those + while (!_intermediate_tuple_desc->slots()[j]->is_materialized()) { + DCHECK_LT(j, _intermediate_tuple_desc->slots().size() - 1); + ++j; + } + SlotDescriptor* slot_desc = _intermediate_tuple_desc->slots()[j]; + AggFnEvaluator* evaluator = _aggregate_evaluators[i]; + + // Timestamp and char are never supported. NDV supports decimal and string but no + // other functions. 
+ // TODO: the other aggregate functions might work with decimal as-is + // TODO(zc) + if (slot_desc->type().type == TYPE_DATETIME || slot_desc->type().type == TYPE_CHAR || + (evaluator->agg_op() != AggFnEvaluator::NDV && + (slot_desc->type().type == TYPE_DECIMAL || + slot_desc->type().type == TYPE_CHAR || + slot_desc->type().type == TYPE_VARCHAR))) { + LOG(INFO) << "Could not codegen UpdateIntermediateTuple because " + << "string, char, timestamp and decimal are not yet supported."; + return NULL; + } + if (evaluator->agg_op() == AggFnEvaluator::COUNT_DISTINCT + || evaluator->agg_op() == AggFnEvaluator::SUM_DISTINCT) { + return NULL; + } + + // Don't codegen things that aren't builtins (for now) + if (!evaluator->is_builtin()) { + return NULL; + } + } + + if (_intermediate_tuple_desc->generate_llvm_struct(codegen) == NULL) { + LOG(INFO) << "Could not codegen update_tuple because we could" + << "not generate a matching llvm struct for the intermediate tuple."; + return NULL; + } + + // Get the types to match the update_tuple signature + Type* agg_node_type = codegen->get_type(AggregationNode::_s_llvm_class_name); + Type* agg_tuple_type = codegen->get_type(Tuple::_s_llvm_class_name); + Type* tuple_row_type = codegen->get_type(TupleRow::_s_llvm_class_name); + + DCHECK(agg_node_type != NULL); + DCHECK(agg_tuple_type != NULL); + DCHECK(tuple_row_type != NULL); + + PointerType* agg_node_ptr_type = PointerType::get(agg_node_type, 0); + PointerType* agg_tuple_ptr_type = PointerType::get(agg_tuple_type, 0); + PointerType* tuple_row_ptr_type = PointerType::get(tuple_row_type, 0); + + // Signature for update_tuple is + // void update_tuple(AggregationNode* this, Tuple* tuple, TupleRow* row) + // This signature needs to match the non-codegen'd signature exactly. 
+ StructType* tuple_struct = _intermediate_tuple_desc->generate_llvm_struct(codegen); + PointerType* tuple_ptr = PointerType::get(tuple_struct, 0); + LlvmCodeGen::FnPrototype prototype(codegen, "update_tuple", codegen->void_type()); + prototype.add_argument(LlvmCodeGen::NamedVariable("this_ptr", agg_node_ptr_type)); + prototype.add_argument(LlvmCodeGen::NamedVariable("agg_tuple", agg_tuple_ptr_type)); + prototype.add_argument(LlvmCodeGen::NamedVariable("tuple_row", tuple_row_ptr_type)); + + LlvmCodeGen::LlvmBuilder builder(codegen->context()); + Value* args[3]; + Function* fn = prototype.generate_prototype(&builder, &args[0]); + + // Cast the parameter types to the internal llvm runtime types. + // TODO: get rid of this by using right type in function signature + args[1] = builder.CreateBitCast(args[1], tuple_ptr, "tuple"); + + // Loop over each expr and generate the IR for that slot. If the expr is not + // count(*), generate a helper IR function to update the slot and call that. + j = _probe_expr_ctxs.size(); + for (int i = 0; i < _aggregate_evaluators.size(); ++i, ++j) { + // skip non-materialized slots; we don't have evaluators instantiated for those + while (!_intermediate_tuple_desc->slots()[j]->is_materialized()) { + DCHECK_LT(j, _intermediate_tuple_desc->slots().size() - 1); + ++j; + } + SlotDescriptor* slot_desc = _intermediate_tuple_desc->slots()[j]; + AggFnEvaluator* evaluator = _aggregate_evaluators[i]; + if (evaluator->is_count_star()) { + // TODO: we should be able to hoist this up to the loop over the batch and just + // increment the slot by the number of rows in the batch. 
+ int field_idx = slot_desc->field_idx(); + Value* const_one = codegen->get_int_constant(TYPE_BIGINT, 1); + Value* slot_ptr = builder.CreateStructGEP(args[1], field_idx, "src_slot"); + Value* slot_loaded = builder.CreateLoad(slot_ptr, "count_star_val"); + Value* count_inc = builder.CreateAdd(slot_loaded, const_one, "count_star_inc"); + builder.CreateStore(count_inc, slot_ptr); + } else { + Function* update_slot_fn = codegen_update_slot(state, evaluator, slot_desc); + if (update_slot_fn == NULL) { + return NULL; + } + Value* fn_ctx_arg = codegen->cast_ptr_to_llvm_ptr( + codegen->get_ptr_type(FunctionContextImpl::_s_llvm_functioncontext_name), + _agg_fn_ctxs[i]); + builder.CreateCall3(update_slot_fn, fn_ctx_arg, args[1], args[2]); + } + } + builder.CreateRetVoid(); + + // CodegenProcessRowBatch() does the final optimizations. + return codegen->finalize_function(fn); +} + +Function* AggregationNode::codegen_process_row_batch( + RuntimeState* state, Function* update_tuple_fn) { + LlvmCodeGen* codegen = NULL; + if (!state->get_codegen(&codegen).ok()) { + return NULL; + } + SCOPED_TIMER(codegen->codegen_timer()); + DCHECK(update_tuple_fn != NULL); + + // Get the cross compiled update row batch function + IRFunction::Type ir_fn = + (!_probe_expr_ctxs.empty() ? IRFunction::AGG_NODE_PROCESS_ROW_BATCH_WITH_GROUPING + : IRFunction::AGG_NODE_PROCESS_ROW_BATCH_NO_GROUPING); + Function* process_batch_fn = codegen->get_function(ir_fn); + if (process_batch_fn == NULL) { + LOG(ERROR) << "Could not find AggregationNode::ProcessRowBatch in module."; + return NULL; + } + + int replaced = 0; + if (!_probe_expr_ctxs.empty()) { + // Aggregation w/o grouping does not use a hash table. 
+ + // Codegen for hash + Function* hash_fn = _hash_tbl->codegen_hash_current_row(state); + if (hash_fn == NULL) { + return NULL; + } + + // Codegen HashTable::Equals + Function* equals_fn = _hash_tbl->codegen_equals(state); + if (equals_fn == NULL) { + return NULL; + } + + // Codegen for evaluating build rows + Function* eval_build_row_fn = _hash_tbl->codegen_eval_tuple_row(state, true); + if (eval_build_row_fn == NULL) { + return NULL; + } + + // Codegen for evaluating probe rows + Function* eval_probe_row_fn = _hash_tbl->codegen_eval_tuple_row(state, false); + if (eval_probe_row_fn == NULL) { + return NULL; + } + + // Replace call sites + process_batch_fn = codegen->replace_call_sites( + process_batch_fn, false, eval_build_row_fn, "eval_build_row", &replaced); + DCHECK_EQ(replaced, 1); + + process_batch_fn = codegen->replace_call_sites( + process_batch_fn, false, eval_probe_row_fn, "eval_probe_row", &replaced); + DCHECK_EQ(replaced, 1); + + process_batch_fn = codegen->replace_call_sites( + process_batch_fn, false, hash_fn, "hash_current_row", &replaced); + DCHECK_EQ(replaced, 2); + + process_batch_fn = codegen->replace_call_sites( + process_batch_fn, false, equals_fn, "equals", &replaced); + DCHECK_EQ(replaced, 1); + } + + process_batch_fn = codegen->replace_call_sites( + process_batch_fn, false, update_tuple_fn, "update_tuple", &replaced); + DCHECK_EQ(replaced, 1) << "One call site should be replaced."; + DCHECK(process_batch_fn != NULL); + return codegen->optimize_function_with_exprs(process_batch_fn); +} +} + diff --git a/be/src/exec/aggregation_node.h b/be/src/exec/aggregation_node.h index 8da5383aee..110d85f3c6 100644 --- a/be/src/exec/aggregation_node.h +++ b/be/src/exec/aggregation_node.h @@ -62,7 +62,7 @@ public: AggregationNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs); virtual ~AggregationNode(); - virtual Status init(const TPlanNode& tnode); + virtual Status init(const TPlanNode& tnode, RuntimeState* state = nullptr); 
virtual Status prepare(RuntimeState* state); virtual Status open(RuntimeState* state); virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos); diff --git a/be/src/exec/analytic_eval_node.cpp b/be/src/exec/analytic_eval_node.cpp index e5ae99bfe4..18ec9e96be 100644 --- a/be/src/exec/analytic_eval_node.cpp +++ b/be/src/exec/analytic_eval_node.cpp @@ -111,8 +111,8 @@ AnalyticEvalNode::AnalyticEvalNode(ObjectPool* pool, const TPlanNode& tnode, VLOG_ROW << "tnode=" << apache::thrift::ThriftDebugString(tnode); } -Status AnalyticEvalNode::init(const TPlanNode& tnode) { - RETURN_IF_ERROR(ExecNode::init(tnode)); +Status AnalyticEvalNode::init(const TPlanNode& tnode, RuntimeState* state) { + RETURN_IF_ERROR(ExecNode::init(tnode, state)); const TAnalyticNode& analytic_node = tnode.analytic_node; bool has_lead_fn = false; @@ -153,7 +153,7 @@ Status AnalyticEvalNode::prepare(RuntimeState* state) { _child_tuple_desc = child(0)->row_desc().tuple_descriptors()[0]; _curr_tuple_pool.reset(new MemPool(mem_tracker())); _prev_tuple_pool.reset(new MemPool(mem_tracker())); - _mem_pool.reset(new MemPool(mem_tracker(), 0)); + _mem_pool.reset(new MemPool(mem_tracker())); _evaluation_timer = ADD_TIMER(runtime_profile(), "EvaluationTime"); diff --git a/be/src/exec/analytic_eval_node.h b/be/src/exec/analytic_eval_node.h index d4d57db71e..adc6fd7840 100644 --- a/be/src/exec/analytic_eval_node.h +++ b/be/src/exec/analytic_eval_node.h @@ -67,7 +67,7 @@ public: ~AnalyticEvalNode() {} AnalyticEvalNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs); - virtual Status init(const TPlanNode& tnode); + virtual Status init(const TPlanNode& tnode, RuntimeState* state = nullptr); virtual Status prepare(RuntimeState* state); virtual Status open(RuntimeState* state); virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos); diff --git a/be/src/exec/blocking_join_node.cpp b/be/src/exec/blocking_join_node.cpp index 1e8650e40a..1c032163c5 100644 --- 
a/be/src/exec/blocking_join_node.cpp +++ b/be/src/exec/blocking_join_node.cpp @@ -43,8 +43,8 @@ BlockingJoinNode::BlockingJoinNode(const std::string& node_name, _join_op(join_op) { } -Status BlockingJoinNode::init(const TPlanNode& tnode) { - return ExecNode::init(tnode); +Status BlockingJoinNode::init(const TPlanNode& tnode, RuntimeState* state) { + return ExecNode::init(tnode, state); } BlockingJoinNode::~BlockingJoinNode() { diff --git a/be/src/exec/blocking_join_node.h b/be/src/exec/blocking_join_node.h index 5a74f90b6c..642061838e 100644 --- a/be/src/exec/blocking_join_node.h +++ b/be/src/exec/blocking_join_node.h @@ -45,7 +45,7 @@ public: // Subclasses should call BlockingJoinNode::init() and then perform any other init() // work, e.g. creating expr trees. - virtual Status init(const TPlanNode& tnode); + virtual Status init(const TPlanNode& tnode, RuntimeState* state = nullptr); // Subclasses should call BlockingJoinNode::prepare() and then perform any other // prepare() work, e.g. codegen. 
diff --git a/be/src/exec/broker_scan_node.cpp b/be/src/exec/broker_scan_node.cpp index adbdf57c3f..b489374a37 100644 --- a/be/src/exec/broker_scan_node.cpp +++ b/be/src/exec/broker_scan_node.cpp @@ -50,7 +50,7 @@ static bool compare_part_use_range(const PartitionInfo* v1, const PartitionInfo* return v1->range() < v2->range(); } -Status BrokerScanNode::init(const TPlanNode& tnode) { +Status BrokerScanNode::init(const TPlanNode& tnode, RuntimeState* state) { RETURN_IF_ERROR(ScanNode::init(tnode)); auto& broker_scan_node = tnode.broker_scan_node; if (broker_scan_node.__isset.partition_exprs) { diff --git a/be/src/exec/broker_scan_node.h b/be/src/exec/broker_scan_node.h index 9427aa5a59..95d64f4086 100644 --- a/be/src/exec/broker_scan_node.h +++ b/be/src/exec/broker_scan_node.h @@ -40,7 +40,7 @@ public: virtual ~BrokerScanNode(); // Called after create this scan node - virtual Status init(const TPlanNode& tnode) override; + virtual Status init(const TPlanNode& tnode, RuntimeState* state = nullptr) override; // initialize _mysql_scanner, and create _text_converter. virtual Status prepare(RuntimeState* state) override; diff --git a/be/src/exec/data_sink.h b/be/src/exec/data_sink.h index 8e7a4b84c7..6b2cdaa8ea 100644 --- a/be/src/exec/data_sink.h +++ b/be/src/exec/data_sink.h @@ -64,7 +64,11 @@ public: // Further send() calls are illegal after calling close(). // It must be okay to call this multiple times. Subsequent calls should // be ignored. - virtual Status close(RuntimeState* state, Status exec_status) = 0; + virtual Status close(RuntimeState* state, Status exec_status) { + _expr_mem_tracker->close(); + _closed = true; + return Status::OK; + } // Creates a new data sink from thrift_sink. A pointer to the // new sink is written to *sink, and is owned by the caller. 
diff --git a/be/src/exec/exchange_node.cpp b/be/src/exec/exchange_node.cpp index 827473addb..fc5a50f317 100644 --- a/be/src/exec/exchange_node.cpp +++ b/be/src/exec/exchange_node.cpp @@ -24,6 +24,7 @@ #include "runtime/data_stream_mgr.h" #include "runtime/data_stream_recvr.h" +#include "runtime/exec_env.h" #include "runtime/runtime_state.h" #include "runtime/row_batch.h" #include "util/debug_util.h" @@ -51,8 +52,8 @@ ExchangeNode::ExchangeNode( DCHECK(_is_merging || (_offset == 0)); } -Status ExchangeNode::init(const TPlanNode& tnode) { - RETURN_IF_ERROR(ExecNode::init(tnode)); +Status ExchangeNode::init(const TPlanNode& tnode, RuntimeState* state) { + RETURN_IF_ERROR(ExecNode::init(tnode, state)); if (!_is_merging) { return Status::OK; } diff --git a/be/src/exec/exchange_node.h b/be/src/exec/exchange_node.h index 8168ed2ffa..2712dd2d4d 100644 --- a/be/src/exec/exchange_node.h +++ b/be/src/exec/exchange_node.h @@ -46,7 +46,7 @@ public: ExchangeNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs); virtual ~ExchangeNode() {} - virtual Status init(const TPlanNode& tnode); + virtual Status init(const TPlanNode& tnode, RuntimeState* state = nullptr); virtual Status prepare(RuntimeState* state); // Blocks until the first batch is available for consumption via GetNext(). 
virtual Status open(RuntimeState* state); diff --git a/be/src/exec/exec_node.cpp b/be/src/exec/exec_node.cpp index 79fd0cd805..1af1619de2 100644 --- a/be/src/exec/exec_node.cpp +++ b/be/src/exec/exec_node.cpp @@ -31,6 +31,7 @@ #include "exprs/expr_context.h" #include "exec/aggregation_node.h" #include "exec/partitioned_aggregation_node.h" +#include "exec/new_partitioned_aggregation_node.h" #include "exec/csv_scan_node.h" #include "exec/pre_aggregation_node.h" #include "exec/hash_join_node.h" @@ -50,7 +51,9 @@ #include "exec/analytic_eval_node.h" #include "exec/select_node.h" #include "exec/union_node.h" +#include "runtime/exec_env.h" #include "runtime/descriptors.h" +#include "runtime/initial_reservations.h" #include "runtime/mem_pool.h" #include "runtime/mem_tracker.h" #include "runtime/row_batch.h" @@ -125,6 +128,7 @@ ExecNode::ExecNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl _pool(pool), _tuple_ids(tnode.row_tuples), _row_descriptor(descs, tnode.row_tuples, tnode.nullable_tuples), + _resource_profile(tnode.resource_profile), _debug_phase(TExecNodePhase::INVALID), _debug_action(TDebugAction::WAIT), _limit(tnode.limit), @@ -163,7 +167,7 @@ void ExecNode::push_down_predicate( } } -Status ExecNode::init(const TPlanNode& tnode) { +Status ExecNode::init(const TPlanNode& tnode, RuntimeState* state) { RETURN_IF_ERROR( Expr::create_expr_trees(_pool, tnode.conjuncts, &_conjunct_ctxs)); return Status::OK; @@ -184,7 +188,9 @@ Status ExecNode::prepare(RuntimeState* state) { ""); _mem_tracker.reset(new MemTracker(-1, _runtime_profile->name(), state->instance_mem_tracker())); _expr_mem_tracker.reset(new MemTracker(-1, "Exprs", _mem_tracker.get())); + _expr_mem_pool.reset(new MemPool(_expr_mem_tracker.get())); + // TODO chenhao RETURN_IF_ERROR(Expr::prepare(_conjunct_ctxs, state, row_desc(), expr_mem_tracker())); // TODO(zc): // AddExprCtxsToFree(_conjunct_ctxs); @@ -201,6 +207,15 @@ Status ExecNode::open(RuntimeState* state) { return 
Expr::open(_conjunct_ctxs, state); } + +Status ExecNode::reset(RuntimeState* state) { + _num_rows_returned = 0; + for (int i = 0; i < _children.size(); ++i) { + RETURN_IF_ERROR(_children[i]->reset(state)); + } + return Status::OK; +} + Status ExecNode::close(RuntimeState* state) { if (_is_closed) { return Status::OK; @@ -219,6 +234,25 @@ Status ExecNode::close(RuntimeState* state) { } Expr::close(_conjunct_ctxs, state); + if (expr_mem_pool() != nullptr) { + _expr_mem_pool->free_all(); + } + + if (_buffer_pool_client.is_registered()) { + VLOG_FILE << _id << " returning reservation " << _resource_profile.min_reservation; + state->initial_reservations()->Return( + &_buffer_pool_client, _resource_profile.min_reservation); + state->exec_env()->buffer_pool()->DeregisterClient(&_buffer_pool_client); + } + + if (_expr_mem_tracker != nullptr) { + _expr_mem_tracker->close(); + } + + if (_mem_tracker != nullptr) { + _mem_tracker->close(); + } + return result; } @@ -235,7 +269,7 @@ void ExecNode::add_runtime_exec_option(const std::string& str) { runtime_profile()->add_info_string("ExecOption", _runtime_exec_options); } -Status ExecNode::create_tree(ObjectPool* pool, const TPlan& plan, +Status ExecNode::create_tree(RuntimeState* state, ObjectPool* pool, const TPlan& plan, const DescriptorTbl& descs, ExecNode** root) { if (plan.nodes.size() == 0) { *root = NULL; @@ -243,7 +277,7 @@ Status ExecNode::create_tree(ObjectPool* pool, const TPlan& plan, } int node_idx = 0; - RETURN_IF_ERROR(create_tree_helper(pool, plan.nodes, descs, NULL, &node_idx, root)); + RETURN_IF_ERROR(create_tree_helper(state, pool, plan.nodes, descs, NULL, &node_idx, root)); if (node_idx + 1 != plan.nodes.size()) { // TODO: print thrift msg for diagnostic purposes. 
@@ -255,6 +289,7 @@ Status ExecNode::create_tree(ObjectPool* pool, const TPlan& plan, } Status ExecNode::create_tree_helper( + RuntimeState* state, ObjectPool* pool, const vector& tnodes, const DescriptorTbl& descs, @@ -270,7 +305,7 @@ Status ExecNode::create_tree_helper( int num_children = tnodes[*node_idx].num_children; ExecNode* node = NULL; - RETURN_IF_ERROR(create_node(pool, tnodes[*node_idx], descs, &node)); + RETURN_IF_ERROR(create_node(state, pool, tnodes[*node_idx], descs, &node)); // assert(parent != NULL || (node_idx == 0 && root_expr != NULL)); if (parent != NULL) { @@ -281,7 +316,7 @@ Status ExecNode::create_tree_helper( for (int i = 0; i < num_children; i++) { ++*node_idx; - RETURN_IF_ERROR(create_tree_helper(pool, tnodes, descs, node, node_idx, NULL)); + RETURN_IF_ERROR(create_tree_helper(state, pool, tnodes, descs, node, node_idx, NULL)); // we are expecting a child, but have used all nodes // this means we have been given a bad tree and must fail @@ -291,7 +326,7 @@ Status ExecNode::create_tree_helper( } } - RETURN_IF_ERROR(node->init(tnode)); + RETURN_IF_ERROR(node->init(tnode, state)); // build up tree of profiles; add children >0 first, so that when we print // the profile, child 0 is printed last (makes the output more readable) @@ -306,13 +341,12 @@ Status ExecNode::create_tree_helper( return Status::OK; } -Status ExecNode::create_node(ObjectPool* pool, const TPlanNode& tnode, +Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs, ExecNode** node) { std::stringstream error_msg; + VLOG(2) << "tnode:\n" << apache::thrift::ThriftDebugString(tnode); switch (tnode.node_type) { - VLOG(2) << "tnode:\n" << apache::thrift::ThriftDebugString(tnode); - case TPlanNodeType::CSV_SCAN_NODE: *node = pool->add(new CsvScanNode(pool, tnode, descs)); return Status::OK; @@ -332,6 +366,8 @@ Status ExecNode::create_node(ObjectPool* pool, const TPlanNode& tnode, case TPlanNodeType::AGGREGATION_NODE: if 
(config::enable_partitioned_aggregation) { *node = pool->add(new PartitionedAggregationNode(pool, tnode, descs)); + } else if (config::enable_new_partitioned_aggregation) { + *node = pool->add(new NewPartitionedAggregationNode(pool, tnode, descs)); } else { *node = pool->add(new AggregationNode(pool, tnode, descs)); } @@ -608,4 +644,65 @@ Function* ExecNode::codegen_eval_conjuncts( return codegen->finalize_function(fn); } +Status ExecNode::claim_buffer_reservation(RuntimeState* state) { + DCHECK(!_buffer_pool_client.is_registered()); + BufferPool* buffer_pool = ExecEnv::GetInstance()->buffer_pool(); + // Check the minimum buffer size in case the minimum buffer size used by the planner + // doesn't match this backend's. + std::stringstream ss; + if (_resource_profile.__isset.spillable_buffer_size && + _resource_profile.spillable_buffer_size < buffer_pool->min_buffer_len()) { + ss << "Spillable buffer size for node " << _id << " of " << _resource_profile.spillable_buffer_size + << "bytes is less than the minimum buffer pool buffer size of " + << buffer_pool->min_buffer_len() << "bytes"; + return Status(ss.str()); + } + + ss << print_plan_node_type(_type) << " id=" << _id << " ptr=" << this; + RETURN_IF_ERROR(buffer_pool->RegisterClient(ss.str(), + state->instance_buffer_reservation(), + mem_tracker(), _resource_profile.max_reservation, + runtime_profile(), + &_buffer_pool_client)); + + state->initial_reservations()->Claim(&_buffer_pool_client, _resource_profile.min_reservation); +/* + if (debug_action_ == TDebugAction::SET_DENY_RESERVATION_PROBABILITY && + (debug_phase_ == TExecNodePhase::PREPARE || debug_phase_ == TExecNodePhase::OPEN)) { + // We may not have been able to enable the debug action at the start of Prepare() or + // Open() because the client is not registered then. Do it now to be sure that it is + // effective. 
+ RETURN_IF_ERROR(EnableDenyReservationDebugAction()); + } +*/ + return Status::OK; +} + +Status ExecNode::release_unused_reservation() { + return _buffer_pool_client.DecreaseReservationTo(_resource_profile.min_reservation); +} +/* +Status ExecNode::enable_deny_reservation_debug_action() { + DCHECK_EQ(debug_action_, TDebugAction::SET_DENY_RESERVATION_PROBABILITY); + DCHECK(_buffer_pool_client.is_registered()); + // Parse [0.0, 1.0] probability. + StringParser::ParseResult parse_result; + double probability = StringParser::StringToFloat( + debug_action_param_.c_str(), debug_action_param_.size(), &parse_result); + if (parse_result != StringParser::PARSE_SUCCESS || probability < 0.0 + || probability > 1.0) { + return Status(Substitute( + "Invalid SET_DENY_RESERVATION_PROBABILITY param: '$0'", debug_action_param_)); + } + _buffer_pool_client.SetDebugDenyIncreaseReservation(probability); + return Status::OK(); +} +*/ + +Status ExecNode::QueryMaintenance(RuntimeState* state) { + // TODO chenhao , when introduce latest AnalyticEvalNode open it + // ScalarExprEvaluator::FreeLocalAllocations(evals_to_free_); + return state->check_query_state(); +} + } diff --git a/be/src/exec/exec_node.h b/be/src/exec/exec_node.h index b4e2de9285..45248d6f7b 100644 --- a/be/src/exec/exec_node.h +++ b/be/src/exec/exec_node.h @@ -28,9 +28,10 @@ #include "common/status.h" #include "gen_cpp/PlanNodes_types.h" #include "runtime/descriptors.h" -#include "runtime/mem_tracker.h" +#include "runtime/mem_pool.h" #include "util/runtime_profile.h" #include "util/blocking_queue.hpp" +#include "runtime/bufferpool/buffer_pool.h" namespace llvm { class Function; @@ -70,7 +71,7 @@ public: /// Initializes this object from the thrift tnode desc. The subclass should /// do any initialization that can fail in Init() rather than the ctor. /// If overridden in subclass, must first call superclass's Init(). 
- virtual Status init(const TPlanNode& tnode); + virtual Status init(const TPlanNode& tnode, RuntimeState* state = nullptr); // Sets up internal structures, etc., without doing any actual work. // Must be called prior to open(). Will only be called once in this @@ -101,6 +102,21 @@ public: // TODO: AggregationNode and HashJoinNode cannot be "re-opened" yet. virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) = 0; + // Resets the stream of row batches to be retrieved by subsequent GetNext() calls. + // Clears all internal state, returning this node to the state it was in after calling + // Prepare() and before calling Open(). This function must not clear memory + // still owned by this node that is backing rows returned in GetNext(). + // Prepare() and Open() must have already been called before calling Reset(). + // GetNext() may have optionally been called (not necessarily until eos). + // Close() must not have been called. + // Reset() is not idempotent. Calling it multiple times in a row without a preceding + // call to Open() is invalid. + // If overridden in a subclass, must call superclass's Reset() at the end. The default + // implementation calls Reset() on children. + // Note that this function may be called many times (proportional to the input data), + // so should be fast. + virtual Status reset(RuntimeState* state); + // close() will get called for every exec node, regardless of what else is called and // the status of these calls (i.e. prepare() may never have been called, or // prepare()/open()/get_next() returned with an error). @@ -124,7 +140,7 @@ public: // Creates exec node tree from list of nodes contained in plan via depth-first // traversal. All nodes are placed in pool. // Returns error if 'plan' is corrupted, otherwise success. 
- static Status create_tree(ObjectPool* pool, const TPlan& plan, + static Status create_tree(RuntimeState* state, ObjectPool* pool, const TPlan& plan, const DescriptorTbl& descs, ExecNode** root); // Set debug action for node with given id in 'tree' @@ -198,6 +214,10 @@ public: return _expr_mem_tracker.get(); } + MemPool* expr_mem_pool() { + return _expr_mem_pool.get(); + } + // Extract node id from p->name(). static int get_node_id_from_profile(RuntimeProfile* p); @@ -207,6 +227,25 @@ public: protected: friend class DataSink; + /// Initialize 'buffer_pool_client_' and claim the initial reservation for this + /// ExecNode. Only needs to be called by ExecNodes that will use the client. + /// The client is automatically cleaned up in Close(). Should not be called if + /// the client is already open. + /// The ExecNode must return the initial reservation to + /// QueryState::initial_reservations(), which is done automatically in Close() as long + /// as the initial reservation is not released before Close(). + Status claim_buffer_reservation(RuntimeState* state); + + /// Release any unused reservation in excess of the node's initial reservation. Returns + /// an error if releasing the reservation requires flushing pages to disk, and that + /// fails. + Status release_unused_reservation(); + + /// Enable the increase reservation denial probability on 'buffer_pool_client_' based on + /// the 'debug_action_' set on this node. Returns an error if 'debug_action_param_' is + /// invalid. + //Status enable_deny_reservation_debug_action(); + /// Extends blocking queue for row batches. Row batches have a property that /// they must be processed in the order they were produced, even in cancellation /// paths. Preceding row batches can contain ptrs to memory in subsequent row batches @@ -261,6 +300,9 @@ protected: std::vector _children; RowDescriptor _row_descriptor; + /// Resource information sent from the frontend. 
+ const TBackendResourceProfile _resource_profile; + // debug-only: if _debug_action is not INVALID, node will perform action in // _debug_phase TExecNodePhase::type _debug_phase; @@ -270,8 +312,17 @@ protected: int64_t _num_rows_returned; boost::scoped_ptr _runtime_profile; + + /// Account for peak memory used by this node boost::scoped_ptr _mem_tracker; + + /// MemTracker used by 'expr_mem_pool_'. boost::scoped_ptr _expr_mem_tracker; + + /// MemPool for allocating data structures used by expression evaluators in this node. + /// Created in Prepare(). + boost::scoped_ptr _expr_mem_pool; + RuntimeProfile::Counter* _rows_returned_counter; RuntimeProfile::Counter* _rows_returned_rate; // Account for peak memory used by this node @@ -282,6 +333,12 @@ protected: // "Codegen Enabled" boost::mutex _exec_options_lock; std::string _runtime_exec_options; + + /// Buffer pool client for this node. Initialized with the node's minimum reservation + /// in ClaimBufferReservation(). After initialization, the client must hold onto at + /// least the minimum reservation so that it can be returned to the initial + /// reservations pool in Close(). + BufferPool::ClientHandle _buffer_pool_client; ExecNode* child(int i) { return _children[i]; @@ -301,10 +358,10 @@ protected: bool is_in_subplan() const { return false; } // Create a single exec node derived from thrift node; place exec node in 'pool'. 
- static Status create_node(ObjectPool* pool, const TPlanNode& tnode, + static Status create_node(RuntimeState* state, ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs, ExecNode** node); - static Status create_tree_helper(ObjectPool* pool, const std::vector& tnodes, + static Status create_tree_helper(RuntimeState* state, ObjectPool* pool, const std::vector& tnodes, const DescriptorTbl& descs, ExecNode* parent, int* node_idx, ExecNode** root); virtual bool is_scan_node() const { @@ -319,6 +376,16 @@ protected: // Appends option to '_runtime_exec_options' void add_runtime_exec_option(const std::string& option); + + /// Frees any local allocations made by evals_to_free_ and returns the result of + /// state->CheckQueryState(). Nodes should call this periodically, e.g. once per input + /// row batch. This should not be called outside the main execution thread. + // + /// Nodes may override this to add extra periodic cleanup, e.g. freeing other local + /// allocations. ExecNodes overriding this function should return + /// ExecNode::QueryMaintenance(). 
+ virtual Status QueryMaintenance(RuntimeState* state) WARN_UNUSED_RESULT; + private: bool _is_closed; }; diff --git a/be/src/exec/hash_join_node.cpp b/be/src/exec/hash_join_node.cpp index 167e4101ce..8ef3d1a9c2 100644 --- a/be/src/exec/hash_join_node.cpp +++ b/be/src/exec/hash_join_node.cpp @@ -63,8 +63,8 @@ HashJoinNode::~HashJoinNode() { DCHECK(_probe_batch == NULL); } -Status HashJoinNode::init(const TPlanNode& tnode) { - RETURN_IF_ERROR(ExecNode::init(tnode)); +Status HashJoinNode::init(const TPlanNode& tnode, RuntimeState* state) { + RETURN_IF_ERROR(ExecNode::init(tnode, state)); DCHECK(tnode.__isset.hash_join_node); const vector& eq_join_conjuncts = tnode.hash_join_node.eq_join_conjuncts; diff --git a/be/src/exec/hash_join_node.h b/be/src/exec/hash_join_node.h index be9bb4cfdf..ef1fdfaa8a 100644 --- a/be/src/exec/hash_join_node.h +++ b/be/src/exec/hash_join_node.h @@ -55,7 +55,7 @@ public: ~HashJoinNode(); // set up _build- and _probe_exprs - virtual Status init(const TPlanNode& tnode); + virtual Status init(const TPlanNode& tnode, RuntimeState* state = nullptr); virtual Status prepare(RuntimeState* state); virtual Status open(RuntimeState* state); virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos); diff --git a/be/src/exec/hash_table.cpp b/be/src/exec/hash_table.cpp index 4d8a3d9b02..2686962fcb 100644 --- a/be/src/exec/hash_table.cpp +++ b/be/src/exec/hash_table.cpp @@ -80,10 +80,12 @@ HashTable::HashTable(const vector& build_expr_ctxs, _nodes = reinterpret_cast(malloc(_nodes_capacity * _node_byte_size)); memset(_nodes, 0, _nodes_capacity * _node_byte_size); +#if 0 if (PaloMetrics::hash_table_total_bytes() != NULL) { PaloMetrics::hash_table_total_bytes()->increment(_nodes_capacity * _node_byte_size); } - +#endif + _mem_tracker->consume(_nodes_capacity * _node_byte_size); if (_mem_tracker->limit_exceeded()) { mem_limit_exceeded(_nodes_capacity * _node_byte_size); @@ -98,10 +100,11 @@ void HashTable::close() { delete[] 
_expr_values_buffer; delete[] _expr_value_null_bits; free(_nodes); - +#if 0 if (PaloMetrics::hash_table_total_bytes() != NULL) { PaloMetrics::hash_table_total_bytes()->increment(-_nodes_capacity * _node_byte_size); } +#endif _mem_tracker->release(_nodes_capacity * _node_byte_size); _mem_tracker->release(_buckets.size() * sizeof(Bucket)); } @@ -269,10 +272,12 @@ void HashTable::grow_node_array() { free(_nodes); _nodes = new_nodes; +#if 0 if (PaloMetrics::hash_table_total_bytes() != NULL) { PaloMetrics::hash_table_total_bytes()->increment(new_size - old_size); } - +#endif + _mem_tracker->consume(new_size - old_size); if (_mem_tracker->limit_exceeded()) { mem_limit_exceeded(new_size - old_size); diff --git a/be/src/exec/merge_join_node.cpp b/be/src/exec/merge_join_node.cpp index d696fc4bf4..1ce0b0fa3e 100644 --- a/be/src/exec/merge_join_node.cpp +++ b/be/src/exec/merge_join_node.cpp @@ -57,9 +57,9 @@ MergeJoinNode::MergeJoinNode( MergeJoinNode::~MergeJoinNode() { } -Status MergeJoinNode::init(const TPlanNode& tnode) { +Status MergeJoinNode::init(const TPlanNode& tnode, RuntimeState* state) { DCHECK(tnode.__isset.merge_join_node); - RETURN_IF_ERROR(ExecNode::init(tnode)); + RETURN_IF_ERROR(ExecNode::init(tnode, state)); const vector& cmp_conjuncts = tnode.merge_join_node.cmp_conjuncts; diff --git a/be/src/exec/merge_join_node.h b/be/src/exec/merge_join_node.h index ab2ced4750..ebcf037846 100644 --- a/be/src/exec/merge_join_node.h +++ b/be/src/exec/merge_join_node.h @@ -43,7 +43,7 @@ public: ~MergeJoinNode(); - virtual Status init(const TPlanNode& tnode); + virtual Status init(const TPlanNode& tnode, RuntimeState* state = nullptr); virtual Status prepare(RuntimeState* state); virtual Status open(RuntimeState* state); virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos); diff --git a/be/src/exec/merge_node.cpp b/be/src/exec/merge_node.cpp index ca8e04ed7e..5734f77daf 100644 --- a/be/src/exec/merge_node.cpp +++ b/be/src/exec/merge_node.cpp @@ -41,8 
+41,8 @@ MergeNode::MergeNode(ObjectPool* pool, const TPlanNode& tnode, _child_row_idx(0) { } -Status MergeNode::init(const TPlanNode& tnode) { - RETURN_IF_ERROR(ExecNode::init(tnode)); +Status MergeNode::init(const TPlanNode& tnode, RuntimeState* state) { + RETURN_IF_ERROR(ExecNode::init(tnode, state)); DCHECK(tnode.__isset.merge_node); // Create _const_expr_lists from thrift exprs. const vector >& const_texpr_lists = tnode.merge_node.const_expr_lists; diff --git a/be/src/exec/merge_node.h b/be/src/exec/merge_node.h index afb3f891e7..748fb2e706 100644 --- a/be/src/exec/merge_node.h +++ b/be/src/exec/merge_node.h @@ -42,7 +42,7 @@ public: virtual ~MergeNode() { } // Create const exprs, child exprs and conjuncts from corresponding thrift exprs. - virtual Status init(const TPlanNode& tnode); + virtual Status init(const TPlanNode& tnode, RuntimeState* state = nullptr); virtual Status prepare(RuntimeState* state); virtual Status open(RuntimeState* state); virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos); diff --git a/be/src/exec/mysql_scan_node.cpp b/be/src/exec/mysql_scan_node.cpp index 121ab3477b..fbb6a808ca 100644 --- a/be/src/exec/mysql_scan_node.cpp +++ b/be/src/exec/mysql_scan_node.cpp @@ -35,7 +35,10 @@ MysqlScanNode::MysqlScanNode(ObjectPool* pool, const TPlanNode& tnode, _tuple_id(tnode.mysql_scan_node.tuple_id), _columns(tnode.mysql_scan_node.columns), _filters(tnode.mysql_scan_node.filters), - _tuple_desc(nullptr) { + _tuple_desc(NULL) { + //_tuple_pool(NULL), + //_mysql_scanner(NULL) { + //_text_converter(NULL) { } MysqlScanNode::~MysqlScanNode() { diff --git a/be/src/exec/mysql_scan_node.h b/be/src/exec/mysql_scan_node.h index b608d23c26..8f0ec7bec2 100644 --- a/be/src/exec/mysql_scan_node.h +++ b/be/src/exec/mysql_scan_node.h @@ -86,7 +86,7 @@ private: // Helper class for converting text to other types; std::unique_ptr _text_converter; // Current tuple. 
- Tuple* _tuple = nullptr; + Tuple* _tuple; }; } diff --git a/be/src/exec/new_partitioned_aggregation_node.cc b/be/src/exec/new_partitioned_aggregation_node.cc new file mode 100644 index 0000000000..3f65732f1c --- /dev/null +++ b/be/src/exec/new_partitioned_aggregation_node.cc @@ -0,0 +1,2073 @@ +// Modifications copyright (C) 2017, Baidu.com, Inc. +// Copyright 2017 The Apache Software Foundation + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "exec/new_partitioned_aggregation_node.h" + +#include +#include +#include +#include + +//#include "codegen/codegen_anyval.h" +//#include "codegen/llvm_codegen.h" +#include "exec/new_partitioned_hash_table.h" +#include "exec/new_partitioned_hash_table.inline.h" +#include "exprs/new_agg_fn_evaluator.h" +#include "exprs/anyval_util.h" +#include "exprs/expr_context.h" +// #include "exprs/scalar_expr_evaluator.h" +#include "exprs/slot_ref.h" +#include "gutil/strings/substitute.h" +#include "runtime/buffered_tuple_stream3.inline.h" +#include "runtime/descriptors.h" +#include "runtime/exec_env.h" +#include "runtime/mem_pool.h" +#include "runtime/mem_tracker.h" +#include "runtime/raw_value.h" +#include "runtime/row_batch.h" +#include "runtime/runtime_state.h" +#include "runtime/string_value.h" +#include "runtime/tuple_row.h" +#include "runtime/tuple.h" +#include "udf/udf_internal.h" +#include "util/debug_util.h" +#include "util/runtime_profile.h" + +#include "gen_cpp/Exprs_types.h" +#include "gen_cpp/PlanNodes_types.h" + +#include "common/names.h" + +// using namespace llvm; +using namespace strings; + +namespace palo { + +const char* NewPartitionedAggregationNode::LLVM_CLASS_NAME = + "class.palo::NewPartitionedAggregationNode"; + +/// The minimum reduction factor (input rows divided by output rows) to grow hash tables +/// in a streaming preaggregation, given that the hash tables are currently the given +/// size or above. The sizes roughly correspond to hash table sizes where the bucket +/// arrays will fit in a cache level. Intuitively, we don't want the working set of the +/// aggregation to expand to the next level of cache unless we're reducing the input +/// enough to outweigh the increased memory latency we'll incur for each hash table +/// lookup. +/// +/// Note that the current reduction achieved is not always a good estimate of the +/// final reduction. It may be biased either way depending on the ordering of the +/// input. 
If the input order is random, we will underestimate the final reduction +/// factor because the probability of a row having the same key as a previous row +/// increases as more input is processed. If the input order is correlated with the +/// key, skew may bias the estimate. If high cardinality keys appear first, we +/// may overestimate and if low cardinality keys appear first, we underestimate. +/// To estimate the eventual reduction achieved, we estimate the final reduction +/// using the planner's estimated input cardinality and the assumption that input +/// is in a random order. This means that we assume that the reduction factor will +/// increase over time. +struct StreamingHtMinReductionEntry { + // Use 'streaming_ht_min_reduction' if the total size of hash table bucket directories in + // bytes is greater than this threshold. + int min_ht_mem; + // The minimum reduction factor to expand the hash tables. + double streaming_ht_min_reduction; +}; + +// TODO: experimentally tune these values and also programmatically get the cache size +// of the machine that we're running on. +static const StreamingHtMinReductionEntry STREAMING_HT_MIN_REDUCTION[] = { + // Expand up to L2 cache always. + {0, 0.0}, + // Expand into L3 cache if we look like we're getting some reduction. + {256 * 1024, 1.1}, + // Expand into main memory if we're getting a significant reduction. 
+ {2 * 1024 * 1024, 2.0}, +}; + +static const int STREAMING_HT_MIN_REDUCTION_SIZE = + sizeof(STREAMING_HT_MIN_REDUCTION) / sizeof(STREAMING_HT_MIN_REDUCTION[0]); + +NewPartitionedAggregationNode::NewPartitionedAggregationNode( + ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs) + : ExecNode(pool, tnode, descs), + intermediate_tuple_id_(tnode.agg_node.intermediate_tuple_id), + intermediate_tuple_desc_(descs.get_tuple_descriptor(intermediate_tuple_id_)), + intermediate_row_desc_(intermediate_tuple_desc_, false), + output_tuple_id_(tnode.agg_node.output_tuple_id), + output_tuple_desc_(descs.get_tuple_descriptor(output_tuple_id_)), + needs_finalize_(tnode.agg_node.need_finalize), + needs_serialize_(false), + output_partition_(NULL), + process_batch_no_grouping_fn_(NULL), + process_batch_fn_(NULL), + process_batch_streaming_fn_(NULL), + build_timer_(NULL), + ht_resize_timer_(NULL), + get_results_timer_(NULL), + num_hash_buckets_(NULL), + partitions_created_(NULL), + max_partition_level_(NULL), + num_row_repartitioned_(NULL), + num_repartitions_(NULL), + num_spilled_partitions_(NULL), + largest_partition_percent_(NULL), + streaming_timer_(NULL), + num_passthrough_rows_(NULL), + preagg_estimated_reduction_(NULL), + preagg_streaming_ht_min_reduction_(NULL), +// estimated_input_cardinality_(tnode.agg_node.estimated_input_cardinality), + singleton_output_tuple_(NULL), + singleton_output_tuple_returned_(true), + partition_eos_(false), + child_eos_(false), + partition_pool_(new ObjectPool()) { + + DCHECK_EQ(PARTITION_FANOUT, 1 << NUM_PARTITIONING_BITS); + + if (tnode.agg_node.__isset.use_streaming_preaggregation) { + is_streaming_preagg_ = tnode.agg_node.use_streaming_preaggregation; + if (is_streaming_preagg_) { + DCHECK(_conjunct_ctxs.empty()) << "Preaggs have no conjuncts"; + DCHECK(!tnode.agg_node.grouping_exprs.empty()) << "Streaming preaggs do grouping"; + DCHECK(_limit == -1) << "Preaggs have no limits"; + } + } else { + is_streaming_preagg_ = false; 
+ } +} + +Status NewPartitionedAggregationNode::init(const TPlanNode& tnode, RuntimeState* state) { + RETURN_IF_ERROR(ExecNode::init(tnode)); + DCHECK(intermediate_tuple_desc_ != nullptr); + DCHECK(output_tuple_desc_ != nullptr); + DCHECK_EQ(intermediate_tuple_desc_->slots().size(), output_tuple_desc_->slots().size()); + + const RowDescriptor& row_desc = child(0)->row_desc(); + RETURN_IF_ERROR(Expr::create(tnode.agg_node.grouping_exprs, row_desc, + state, &grouping_exprs_, mem_tracker())); + // Construct build exprs from intermediate_row_desc_ + for (int i = 0; i < grouping_exprs_.size(); ++i) { + SlotDescriptor* desc = intermediate_tuple_desc_->slots()[i]; + DCHECK(desc->type().type == TYPE_NULL || desc->type() == grouping_exprs_[i]->type()); + // Hack to avoid TYPE_NULL SlotRefs. + SlotRef* build_expr = _pool->add(desc->type().type != TYPE_NULL ? + new SlotRef(desc) : new SlotRef(desc, TYPE_BOOLEAN)); + build_exprs_.push_back(build_expr); + // TODO chenhao + RETURN_IF_ERROR(build_expr->prepare(state, intermediate_row_desc_, nullptr)); + if (build_expr->type().is_var_len_string_type()) string_grouping_exprs_.push_back(i); + } + + int j = grouping_exprs_.size(); + for (int i = 0; i < tnode.agg_node.aggregate_functions.size(); ++i, ++j) { + SlotDescriptor* intermediate_slot_desc = intermediate_tuple_desc_->slots()[j]; + SlotDescriptor* output_slot_desc = output_tuple_desc_->slots()[j]; + AggFn* agg_fn; + RETURN_IF_ERROR(AggFn::Create(tnode.agg_node.aggregate_functions[i], row_desc, + *intermediate_slot_desc, *output_slot_desc, state, &agg_fn)); + agg_fns_.push_back(agg_fn); + needs_serialize_ |= agg_fn->SupportsSerialize(); + } + return Status::OK; +} + +Status NewPartitionedAggregationNode::prepare(RuntimeState* state) { + SCOPED_TIMER(_runtime_profile->total_time_counter()); + + RETURN_IF_ERROR(ExecNode::prepare(state)); + state_ = state; + + mem_pool_.reset(new MemPool(mem_tracker())); + agg_fn_pool_.reset(new MemPool(expr_mem_tracker())); + + ht_resize_timer_ = 
ADD_TIMER(runtime_profile(), "HTResizeTime"); + get_results_timer_ = ADD_TIMER(runtime_profile(), "GetResultsTime"); + num_hash_buckets_ = + ADD_COUNTER(runtime_profile(), "HashBuckets", TUnit::UNIT); + partitions_created_ = + ADD_COUNTER(runtime_profile(), "PartitionsCreated", TUnit::UNIT); + largest_partition_percent_ = + runtime_profile()->AddHighWaterMarkCounter("LargestPartitionPercent", TUnit::UNIT); + if (is_streaming_preagg_) { + runtime_profile()->append_exec_option("Streaming Preaggregation"); + streaming_timer_ = ADD_TIMER(runtime_profile(), "StreamingTime"); + num_passthrough_rows_ = + ADD_COUNTER(runtime_profile(), "RowsPassedThrough", TUnit::UNIT); + preagg_estimated_reduction_ = ADD_COUNTER( + runtime_profile(), "ReductionFactorEstimate", TUnit::DOUBLE_VALUE); + preagg_streaming_ht_min_reduction_ = ADD_COUNTER( + runtime_profile(), "ReductionFactorThresholdToExpand", TUnit::DOUBLE_VALUE); + } else { + build_timer_ = ADD_TIMER(runtime_profile(), "BuildTime"); + num_row_repartitioned_ = + ADD_COUNTER(runtime_profile(), "RowsRepartitioned", TUnit::UNIT); + num_repartitions_ = + ADD_COUNTER(runtime_profile(), "NumRepartitions", TUnit::UNIT); + num_spilled_partitions_ = + ADD_COUNTER(runtime_profile(), "SpilledPartitions", TUnit::UNIT); + max_partition_level_ = runtime_profile()->AddHighWaterMarkCounter( + "MaxPartitionLevel", TUnit::UNIT); + } + // TODO chenhao + const RowDescriptor& row_desc = child(0)->row_desc(); + RETURN_IF_ERROR(NewAggFnEvaluator::Create(agg_fns_, state, _pool, agg_fn_pool_.get(), + &agg_fn_evals_, expr_mem_tracker(), row_desc)); + + if (!grouping_exprs_.empty()) { + RowDescriptor build_row_desc(intermediate_tuple_desc_, false); + RETURN_IF_ERROR(NewPartitionedHashTableCtx::Create(_pool, state, build_exprs_, + grouping_exprs_, true, vector(build_exprs_.size(), true), + state->fragment_hash_seed(), MAX_PARTITION_DEPTH, 1, expr_mem_pool(), + expr_mem_tracker(), build_row_desc, row_desc, &ht_ctx_)); + } + // 
AddCodegenDisabledMessage(state); + return Status::OK; +} + +//void NewPartitionedAggregationNode::Codegen(RuntimeState* state) { +// DCHECK(state->ShouldCodegen()); +// ExecNode::Codegen(state); +// if (IsNodeCodegenDisabled()) return; +// +// LlvmCodeGen* codegen = state->codegen(); +// DCHECK(codegen != NULL); +// TPrefetchMode::type prefetch_mode = state_->query_options().prefetch_mode; +// Status codegen_status = is_streaming_preagg_ ? +// CodegenProcessBatchStreaming(codegen, prefetch_mode) : +// CodegenProcessBatch(codegen, prefetch_mode); +// runtime_profile()->AddCodegenMsg(codegen_status.ok(), codegen_status); +//} + +Status NewPartitionedAggregationNode::open(RuntimeState* state) { + SCOPED_TIMER(_runtime_profile->total_time_counter()); + // Open the child before consuming resources in this node. + RETURN_IF_ERROR(child(0)->open(state)); + RETURN_IF_ERROR(ExecNode::open(state)); + + // Claim reservation after the child has been opened to reduce the peak reservation + // requirement. + if (!_buffer_pool_client.is_registered() && !grouping_exprs_.empty()) { + DCHECK_GE(_resource_profile.min_reservation, MinReservation()); + RETURN_IF_ERROR(claim_buffer_reservation(state)); + } + + if (ht_ctx_.get() != nullptr) RETURN_IF_ERROR(ht_ctx_->Open(state)); + RETURN_IF_ERROR(NewAggFnEvaluator::Open(agg_fn_evals_, state)); + if (grouping_exprs_.empty()) { + // Create the single output tuple for this non-grouping agg. This must happen after + // opening the aggregate evaluators. + singleton_output_tuple_ = + ConstructSingletonOutputTuple(agg_fn_evals_, mem_pool_.get()); + // Check for failures during NewAggFnEvaluator::Init(). + RETURN_IF_ERROR(state_->query_status()); + singleton_output_tuple_returned_ = false; + } else { + if (ht_allocator_ == nullptr) { + // Allocate 'serialize_stream_' and 'ht_allocator_' on the first Open() call. 
+ ht_allocator_.reset(new Suballocator(state_->exec_env()->buffer_pool(), + &_buffer_pool_client, _resource_profile.spillable_buffer_size)); + + if (!is_streaming_preagg_ && needs_serialize_) { + serialize_stream_.reset(new BufferedTupleStream3(state, &intermediate_row_desc_, + &_buffer_pool_client, _resource_profile.spillable_buffer_size, + _resource_profile.max_row_buffer_size)); + RETURN_IF_ERROR(serialize_stream_->Init(id(), false)); + bool got_buffer; + // Reserve the memory for 'serialize_stream_' so we don't need to scrounge up + // another buffer during spilling. + RETURN_IF_ERROR(serialize_stream_->PrepareForWrite(&got_buffer)); + DCHECK(got_buffer) + << "Accounted in min reservation" << _buffer_pool_client.DebugString(); + DCHECK(serialize_stream_->has_write_iterator()); + } + } + RETURN_IF_ERROR(CreateHashPartitions(0)); + } + + // Streaming preaggregations do all processing in GetNext(). + if (is_streaming_preagg_) return Status::OK; + + RowBatch batch(child(0)->row_desc(), state->batch_size(), mem_tracker()); + // Read all the rows from the child and process them. + bool eos = false; + do { + RETURN_IF_CANCELLED(state); + RETURN_IF_ERROR(QueryMaintenance(state)); + RETURN_IF_ERROR(_children[0]->get_next(state, &batch, &eos)); + + if (UNLIKELY(VLOG_ROW_IS_ON)) { + for (int i = 0; i < batch.num_rows(); ++i) { + TupleRow* row = batch.get_row(i); + VLOG_ROW << "input row: " << print_row(row, _children[0]->row_desc()); + } + } + + SCOPED_TIMER(build_timer_); + if (grouping_exprs_.empty()) { + if (process_batch_no_grouping_fn_ != NULL) { + RETURN_IF_ERROR(process_batch_no_grouping_fn_(this, &batch)); + } else { + RETURN_IF_ERROR(ProcessBatchNoGrouping(&batch)); + } + } else { + // There is grouping, so we will do partitioned aggregation. 
+ if (process_batch_fn_ != NULL) { + RETURN_IF_ERROR(process_batch_fn_(this, &batch, ht_ctx_.get())); + } else { + RETURN_IF_ERROR(ProcessBatch(&batch, ht_ctx_.get())); + } + } + batch.reset(); + } while (!eos); + + // The child can be closed at this point in most cases because we have consumed all of + // the input from the child and transfered ownership of the resources we need. The + // exception is if we are inside a subplan expecting to call Open()/GetNext() on the + // child again, + if (!is_in_subplan()) child(0)->close(state); + child_eos_ = true; + + // Done consuming child(0)'s input. Move all the partitions in hash_partitions_ + // to spilled_partitions_ or aggregated_partitions_. We'll finish the processing in + // GetNext(). + if (!grouping_exprs_.empty()) { + RETURN_IF_ERROR(MoveHashPartitions(child(0)->rows_returned())); + } + return Status::OK; +} + +Status NewPartitionedAggregationNode::get_next(RuntimeState* state, RowBatch* row_batch, + bool* eos) { + int first_row_idx = row_batch->num_rows(); + RETURN_IF_ERROR(GetNextInternal(state, row_batch, eos)); + RETURN_IF_ERROR(HandleOutputStrings(row_batch, first_row_idx)); + return Status::OK; +} + +Status NewPartitionedAggregationNode::HandleOutputStrings(RowBatch* row_batch, + int first_row_idx) { + if (!needs_finalize_ && !needs_serialize_) return Status::OK; + // String data returned by Serialize() or Finalize() is from local expr allocations in + // the agg function contexts, and will be freed on the next GetNext() call by + // FreeLocalAllocations(). The data either needs to be copied out now or sent up the + // plan and copied out by a blocking ancestor. (See IMPALA-3311) + for (const AggFn* agg_fn : agg_fns_) { + const SlotDescriptor& slot_desc = agg_fn->output_slot_desc(); + DCHECK(!slot_desc.type().is_collection_type()) << "producing collections NYI"; + if (!slot_desc.type().is_var_len_string_type()) continue; + if (is_in_subplan()) { + // Copy string data to the row batch's pool. 
This is more efficient than + // MarkNeedsDeepCopy() in a subplan since we are likely producing many small + // batches. + RETURN_IF_ERROR(CopyStringData(slot_desc, row_batch, + first_row_idx, row_batch->tuple_data_pool())); + } else { + row_batch->mark_needs_deep_copy(); + break; + } + } + return Status::OK; +} + +Status NewPartitionedAggregationNode::CopyStringData(const SlotDescriptor& slot_desc, + RowBatch* row_batch, int first_row_idx, MemPool* pool) { + DCHECK(slot_desc.type().is_var_len_string_type()); + DCHECK_EQ(row_batch->row_desc().tuple_descriptors().size(), 1); + FOREACH_ROW(row_batch, first_row_idx, batch_iter) { + Tuple* tuple = batch_iter.get()->get_tuple(0); + StringValue* sv = reinterpret_cast( + tuple->get_slot(slot_desc.tuple_offset())); + if (sv == NULL || sv->len == 0) continue; + char* new_ptr = reinterpret_cast(pool->try_allocate(sv->len)); + if (UNLIKELY(new_ptr == NULL)) { + string details = Substitute("Cannot perform aggregation at node with id $0." + " Failed to allocate $1 output bytes.", _id, sv->len); + return pool->mem_tracker()->MemLimitExceeded(state_, details, sv->len); + } + memcpy(new_ptr, sv->ptr, sv->len); + sv->ptr = new_ptr; + } + return Status::OK; +} + +Status NewPartitionedAggregationNode::GetNextInternal(RuntimeState* state, + RowBatch* row_batch, bool* eos) { + SCOPED_TIMER(_runtime_profile->total_time_counter()); + RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); + RETURN_IF_CANCELLED(state); + RETURN_IF_ERROR(state->check_query_state()); + + if (reached_limit()) { + *eos = true; + return Status::OK; + } + + if (grouping_exprs_.empty()) { + // There was no grouping, so evaluate the conjuncts and return the single result row. + // We allow calling GetNext() after eos, so don't return this row again. 
+ if (!singleton_output_tuple_returned_) GetSingletonOutput(row_batch); + singleton_output_tuple_returned_ = true; + *eos = true; + return Status::OK; + } + + if (!child_eos_) { + // For streaming preaggregations, we process rows from the child as we go. + DCHECK(is_streaming_preagg_); + RETURN_IF_ERROR(GetRowsStreaming(state, row_batch)); + } else if (!partition_eos_) { + RETURN_IF_ERROR(GetRowsFromPartition(state, row_batch)); + } + + *eos = partition_eos_ && child_eos_; + COUNTER_SET(_rows_returned_counter, _num_rows_returned); + return Status::OK; +} + +void NewPartitionedAggregationNode::GetSingletonOutput(RowBatch* row_batch) { + DCHECK(grouping_exprs_.empty()); + int row_idx = row_batch->add_row(); + TupleRow* row = row_batch->get_row(row_idx); + Tuple* output_tuple = GetOutputTuple(agg_fn_evals_, + singleton_output_tuple_, row_batch->tuple_data_pool()); + row->set_tuple(0, output_tuple); + if (ExecNode::eval_conjuncts( + _conjunct_ctxs.data(), _conjunct_ctxs.size(), row)) { + row_batch->commit_last_row(); + ++_num_rows_returned; + COUNTER_SET(_rows_returned_counter, _num_rows_returned); + } + // Keep the current chunk to amortize the memory allocation over a series + // of Reset()/Open()/GetNext()* calls. + row_batch->tuple_data_pool()->acquire_data(mem_pool_.get(), true); + // This node no longer owns the memory for singleton_output_tuple_. + singleton_output_tuple_ = NULL; +} + +Status NewPartitionedAggregationNode::GetRowsFromPartition(RuntimeState* state, + RowBatch* row_batch) { + DCHECK(!row_batch->at_capacity()); + if (output_iterator_.AtEnd()) { + // Done with this partition, move onto the next one. + if (output_partition_ != NULL) { + output_partition_->Close(false); + output_partition_ = NULL; + } + if (aggregated_partitions_.empty() && spilled_partitions_.empty()) { + // No more partitions, all done. + partition_eos_ = true; + return Status::OK; + } + // Process next partition. 
+ RETURN_IF_ERROR(NextPartition()); + DCHECK(output_partition_ != NULL); + } + + SCOPED_TIMER(get_results_timer_); + int count = 0; + const int N = BitUtil::next_power_of_two(state->batch_size()); + // Keeping returning rows from the current partition. + while (!output_iterator_.AtEnd()) { + // This loop can go on for a long time if the conjuncts are very selective. Do query + // maintenance every N iterations. + if ((count++ & (N - 1)) == 0) { + RETURN_IF_CANCELLED(state); + RETURN_IF_ERROR(QueryMaintenance(state)); + } + + int row_idx = row_batch->add_row(); + TupleRow* row = row_batch->get_row(row_idx); + Tuple* intermediate_tuple = output_iterator_.GetTuple(); + Tuple* output_tuple = GetOutputTuple( + output_partition_->agg_fn_evals, intermediate_tuple, row_batch->tuple_data_pool()); + output_iterator_.Next(); + row->set_tuple(0, output_tuple); + // TODO chenhao + // DCHECK_EQ(_conjunct_ctxs.size(), _conjuncts.size()); + if (ExecNode::eval_conjuncts(_conjunct_ctxs.data(), _conjunct_ctxs.size(), row)) { + row_batch->commit_last_row(); + ++_num_rows_returned; + if (reached_limit() || row_batch->at_capacity()) { + break; + } + } + } + + COUNTER_SET(_rows_returned_counter, _num_rows_returned); + partition_eos_ = reached_limit(); + if (output_iterator_.AtEnd()) row_batch->mark_needs_deep_copy(); + + return Status::OK; +} + +Status NewPartitionedAggregationNode::GetRowsStreaming(RuntimeState* state, + RowBatch* out_batch) { + DCHECK(!child_eos_); + DCHECK(is_streaming_preagg_); + + if (child_batch_ == NULL) { + child_batch_.reset(new RowBatch(child(0)->row_desc(), state->batch_size(), + mem_tracker())); + } + + do { + DCHECK_EQ(out_batch->num_rows(), 0); + RETURN_IF_CANCELLED(state); + RETURN_IF_ERROR(QueryMaintenance(state)); + + RETURN_IF_ERROR(child(0)->get_next(state, child_batch_.get(), &child_eos_)); + + SCOPED_TIMER(streaming_timer_); + + int remaining_capacity[PARTITION_FANOUT]; + bool ht_needs_expansion = false; + for (int i = 0; i < PARTITION_FANOUT; ++i) { 
+ NewPartitionedHashTable* hash_tbl = GetHashTable(i); + remaining_capacity[i] = hash_tbl->NumInsertsBeforeResize(); + ht_needs_expansion |= remaining_capacity[i] < child_batch_->num_rows(); + } + + // Stop expanding hash tables if we're not reducing the input sufficiently. As our + // hash tables expand out of each level of cache hierarchy, every hash table lookup + // will take longer. We also may not be able to expand hash tables because of memory + // pressure. In this case HashTable::CheckAndResize() will fail. In either case we + // should always use the remaining space in the hash table to avoid wasting memory. + if (ht_needs_expansion && ShouldExpandPreaggHashTables()) { + for (int i = 0; i < PARTITION_FANOUT; ++i) { + NewPartitionedHashTable* ht = GetHashTable(i); + if (remaining_capacity[i] < child_batch_->num_rows()) { + SCOPED_TIMER(ht_resize_timer_); + bool resized; + RETURN_IF_ERROR( + ht->CheckAndResize(child_batch_->num_rows(), ht_ctx_.get(), &resized)); + if (resized) { + remaining_capacity[i] = ht->NumInsertsBeforeResize(); + } + } + } + } + + if (process_batch_streaming_fn_ != NULL) { + RETURN_IF_ERROR(process_batch_streaming_fn_(this, needs_serialize_, + child_batch_.get(), out_batch, ht_ctx_.get(), remaining_capacity)); + } else { + RETURN_IF_ERROR(ProcessBatchStreaming(needs_serialize_, + child_batch_.get(), out_batch, ht_ctx_.get(), remaining_capacity)); + } + + child_batch_->reset(); // All rows from child_batch_ were processed. 
+ } while (out_batch->num_rows() == 0 && !child_eos_); + + if (child_eos_) { + child(0)->close(state); + child_batch_.reset(); + RETURN_IF_ERROR(MoveHashPartitions(child(0)->rows_returned())); + } + + _num_rows_returned += out_batch->num_rows(); + COUNTER_SET(num_passthrough_rows_, _num_rows_returned); + return Status::OK; +} + +bool NewPartitionedAggregationNode::ShouldExpandPreaggHashTables() const { + int64_t ht_mem = 0; + int64_t ht_rows = 0; + for (int i = 0; i < PARTITION_FANOUT; ++i) { + NewPartitionedHashTable* ht = hash_partitions_[i]->hash_tbl.get(); + ht_mem += ht->CurrentMemSize(); + ht_rows += ht->size(); + } + + // Need some rows in tables to have valid statistics. + if (ht_rows == 0) return true; + + // Find the appropriate reduction factor in our table for the current hash table sizes. + int cache_level = 0; + while (cache_level + 1 < STREAMING_HT_MIN_REDUCTION_SIZE && + ht_mem >= STREAMING_HT_MIN_REDUCTION[cache_level + 1].min_ht_mem) { + ++cache_level; + } + + // Compare the number of rows in the hash table with the number of input rows that + // were aggregated into it. Exclude passed through rows from this calculation since + // they were not in hash tables. + const int64_t input_rows = _children[0]->rows_returned(); + const int64_t aggregated_input_rows = input_rows - _num_rows_returned; + // TODO chenhao +// const int64_t expected_input_rows = estimated_input_cardinality_ - num_rows_returned_; + double current_reduction = static_cast(aggregated_input_rows) / ht_rows; + + // TODO: workaround for IMPALA-2490: subplan node rows_returned counter may be + // inaccurate, which could lead to a divide by zero below. 
+ if (aggregated_input_rows <= 0) return true; + + // Extrapolate the current reduction factor (r) using the formula + // R = 1 + (N / n) * (r - 1), where R is the reduction factor over the full input data + // set, N is the number of input rows, excluding passed-through rows, and n is the + // number of rows inserted or merged into the hash tables. This is a very rough + // approximation but is good enough to be useful. + // TODO: consider collecting more statistics to better estimate reduction. +// double estimated_reduction = aggregated_input_rows >= expected_input_rows +// ? current_reduction +// : 1 + (expected_input_rows / aggregated_input_rows) * (current_reduction - 1); + double min_reduction = + STREAMING_HT_MIN_REDUCTION[cache_level].streaming_ht_min_reduction; + +// COUNTER_SET(preagg_estimated_reduction_, estimated_reduction); + COUNTER_SET(preagg_streaming_ht_min_reduction_, min_reduction); +// return estimated_reduction > min_reduction; + return current_reduction > min_reduction; +} + +void NewPartitionedAggregationNode::CleanupHashTbl( + const vector& agg_fn_evals, NewPartitionedHashTable::Iterator it) { + if (!needs_finalize_ && !needs_serialize_) return; + + // Iterate through the remaining rows in the hash table and call Serialize/Finalize on + // them in order to free any memory allocated by UDAs. + if (needs_finalize_) { + // Finalize() requires a dst tuple but we don't actually need the result, + // so allocate a single dummy tuple to avoid accumulating memory. 
+ Tuple* dummy_dst = NULL; + dummy_dst = Tuple::create(output_tuple_desc_->byte_size(), mem_pool_.get()); + while (!it.AtEnd()) { + Tuple* tuple = it.GetTuple(); + NewAggFnEvaluator::Finalize(agg_fn_evals, tuple, dummy_dst); + it.Next(); + } + } else { + while (!it.AtEnd()) { + Tuple* tuple = it.GetTuple(); + NewAggFnEvaluator::Serialize(agg_fn_evals, tuple); + it.Next(); + } + } +} + +Status NewPartitionedAggregationNode::reset(RuntimeState* state) { + DCHECK(!is_streaming_preagg_) << "Cannot reset preaggregation"; + if (!grouping_exprs_.empty()) { + child_eos_ = false; + partition_eos_ = false; + // Reset the HT and the partitions for this grouping agg. + ht_ctx_->set_level(0); + ClosePartitions(); + } + return ExecNode::reset(state); +} + +Status NewPartitionedAggregationNode::close(RuntimeState* state) { + if (is_closed()) return Status::OK; + + if (!singleton_output_tuple_returned_) { + GetOutputTuple(agg_fn_evals_, singleton_output_tuple_, mem_pool_.get()); + } + + // Iterate through the remaining rows in the hash table and call Serialize/Finalize on + // them in order to free any memory allocated by UDAs + if (output_partition_ != NULL) { + CleanupHashTbl(output_partition_->agg_fn_evals, output_iterator_); + output_partition_->Close(false); + } + + ClosePartitions(); + + child_batch_.reset(); + + // Close all the agg-fn-evaluators + NewAggFnEvaluator::Close(agg_fn_evals_, state); + + if (agg_fn_pool_.get() != nullptr) agg_fn_pool_->free_all(); + if (mem_pool_.get() != nullptr) mem_pool_->free_all(); + if (ht_ctx_.get() != nullptr) ht_ctx_->Close(state); + ht_ctx_.reset(); + if (serialize_stream_.get() != nullptr) { + serialize_stream_->Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES); + } + Expr::close(grouping_exprs_); + Expr::close(build_exprs_); + AggFn::Close(agg_fns_); + return ExecNode::close(state); +} + +NewPartitionedAggregationNode::Partition::~Partition() { + DCHECK(is_closed); +} + +Status 
NewPartitionedAggregationNode::Partition::InitStreams() { + agg_fn_pool.reset(new MemPool(parent->expr_mem_tracker())); + DCHECK_EQ(agg_fn_evals.size(), 0); + NewAggFnEvaluator::ShallowClone(parent->partition_pool_.get(), agg_fn_pool.get(), + parent->agg_fn_evals_, &agg_fn_evals); + + // Varlen aggregate function results are stored outside of aggregated_row_stream because + // BufferedTupleStream3 doesn't support relocating varlen data stored in the stream. + auto agg_slot = parent->intermediate_tuple_desc_->slots().begin() + + parent->grouping_exprs_.size(); + std::set external_varlen_slots; + for (; agg_slot != parent->intermediate_tuple_desc_->slots().end(); ++agg_slot) { + if ((*agg_slot)->type().is_var_len_string_type()) { + external_varlen_slots.insert((*agg_slot)->id()); + } + } + + aggregated_row_stream.reset(new BufferedTupleStream3(parent->state_, + &parent->intermediate_row_desc_, &parent->_buffer_pool_client, + parent->_resource_profile.spillable_buffer_size, + parent->_resource_profile.max_row_buffer_size, external_varlen_slots)); + RETURN_IF_ERROR( + aggregated_row_stream->Init(parent->id(), true)); + bool got_buffer; + RETURN_IF_ERROR(aggregated_row_stream->PrepareForWrite(&got_buffer)); + DCHECK(got_buffer) << "Buffer included in reservation " << parent->_id << "\n" + << parent->_buffer_pool_client.DebugString() << "\n" + << parent->DebugString(2); + + if (!parent->is_streaming_preagg_) { + unaggregated_row_stream.reset(new BufferedTupleStream3(parent->state_, + &(parent->child(0)->row_desc()), &parent->_buffer_pool_client, + parent->_resource_profile.spillable_buffer_size, + parent->_resource_profile.max_row_buffer_size)); + // This stream is only used to spill, no need to ever have this pinned. + RETURN_IF_ERROR(unaggregated_row_stream->Init(parent->id(), false)); + // Save memory by waiting until we spill to allocate the write buffer for the + // unaggregated row stream. 
+ DCHECK(!unaggregated_row_stream->has_write_iterator()); + } + return Status::OK; +} + +Status NewPartitionedAggregationNode::Partition::InitHashTable(bool* got_memory) { + DCHECK(aggregated_row_stream != nullptr); + DCHECK(hash_tbl == nullptr); + // We use the upper PARTITION_FANOUT num bits to pick the partition so only the + // remaining bits can be used for the hash table. + // TODO: we could switch to 64 bit hashes and then we don't need a max size. + // It might be reasonable to limit individual hash table size for other reasons + // though. Always start with small buffers. + hash_tbl.reset(NewPartitionedHashTable::Create(parent->ht_allocator_.get(), false, 1, nullptr, + 1L << (32 - NUM_PARTITIONING_BITS), PAGG_DEFAULT_HASH_TABLE_SZ)); + // Please update the error message in CreateHashPartitions() if initial size of + // hash table changes. + return hash_tbl->Init(got_memory); +} + +Status NewPartitionedAggregationNode::Partition::SerializeStreamForSpilling() { + DCHECK(!parent->is_streaming_preagg_); + if (parent->needs_serialize_) { + // We need to do a lot more work in this case. This step effectively does a merge + // aggregation in this node. We need to serialize the intermediates, spill the + // intermediates and then feed them into the aggregate function's merge step. + // This is often used when the intermediate is a string type, meaning the current + // (before serialization) in-memory layout is not the on-disk block layout. + // The disk layout does not support mutable rows. We need to rewrite the stream + // into the on disk format. + // TODO: if it happens to not be a string, we could serialize in place. This is + // a future optimization since it is very unlikely to have a serialize phase + // for those UDAs. + DCHECK(parent->serialize_stream_.get() != NULL); + DCHECK(!parent->serialize_stream_->is_pinned()); + + // Serialize and copy the spilled partition's stream into the new stream. 
+ Status status = Status::OK; + BufferedTupleStream3* new_stream = parent->serialize_stream_.get(); + NewPartitionedHashTable::Iterator it = hash_tbl->Begin(parent->ht_ctx_.get()); + while (!it.AtEnd()) { + Tuple* tuple = it.GetTuple(); + it.Next(); + NewAggFnEvaluator::Serialize(agg_fn_evals, tuple); + if (UNLIKELY(!new_stream->AddRow(reinterpret_cast(&tuple), &status))) { + DCHECK(!status.ok()) << "Stream was unpinned - AddRow() only fails on error"; + // Even if we can't add to new_stream, finish up processing this agg stream to make + // clean up easier (someone has to finalize this stream and we don't want to remember + // where we are). + parent->CleanupHashTbl(agg_fn_evals, it); + hash_tbl->Close(); + hash_tbl.reset(); + aggregated_row_stream->Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES); + return status; + } + } + + aggregated_row_stream->Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES); + aggregated_row_stream.swap(parent->serialize_stream_); + // Recreate the serialize_stream (and reserve 1 buffer) now in preparation for + // when we need to spill again. We need to have this available before we need + // to spill to make sure it is available. This should be acquirable since we just + // freed at least one buffer from this partition's (old) aggregated_row_stream. 
+ parent->serialize_stream_.reset(new BufferedTupleStream3(parent->state_, + &parent->intermediate_row_desc_, &parent->_buffer_pool_client, + parent->_resource_profile.spillable_buffer_size, + parent->_resource_profile.max_row_buffer_size)); + status = parent->serialize_stream_->Init(parent->id(), false); + if (status.ok()) { + bool got_buffer; + status = parent->serialize_stream_->PrepareForWrite(&got_buffer); + DCHECK(!status.ok() || got_buffer) << "Accounted in min reservation"; + } + if (!status.ok()) { + hash_tbl->Close(); + hash_tbl.reset(); + return status; + } + DCHECK(parent->serialize_stream_->has_write_iterator()); + } + return Status::OK; +} + +Status NewPartitionedAggregationNode::Partition::Spill(bool more_aggregate_rows) { + DCHECK(!parent->is_streaming_preagg_); + DCHECK(!is_closed); + DCHECK(!is_spilled()); + RETURN_IF_ERROR(parent->state_->StartSpilling(parent->mem_tracker())); + + RETURN_IF_ERROR(SerializeStreamForSpilling()); + + // Free the in-memory result data. + NewAggFnEvaluator::Close(agg_fn_evals, parent->state_); + agg_fn_evals.clear(); + + if (agg_fn_pool.get() != NULL) { + agg_fn_pool->free_all(); + agg_fn_pool.reset(); + } + + hash_tbl->Close(); + hash_tbl.reset(); + + // Unpin the stream to free memory, but leave a write buffer in place so we can + // continue appending rows to one of the streams in the partition. 
+ DCHECK(aggregated_row_stream->has_write_iterator()); + DCHECK(!unaggregated_row_stream->has_write_iterator()); + if (more_aggregate_rows) { +// aggregated_row_stream->UnpinStream(BufferedTupleStream3::UNPIN_ALL_EXCEPT_CURRENT); + } else { +// aggregated_row_stream->UnpinStream(BufferedTupleStream3::UNPIN_ALL); + bool got_buffer; + RETURN_IF_ERROR(unaggregated_row_stream->PrepareForWrite(&got_buffer)); + DCHECK(got_buffer) + << "Accounted in min reservation" << parent->_buffer_pool_client.DebugString(); + } + + COUNTER_UPDATE(parent->num_spilled_partitions_, 1); + if (parent->num_spilled_partitions_->value() == 1) { + parent->add_runtime_exec_option("Spilled"); + } + return Status::OK; +} + +void NewPartitionedAggregationNode::Partition::Close(bool finalize_rows) { + if (is_closed) return; + is_closed = true; + if (aggregated_row_stream.get() != NULL) { + if (finalize_rows && hash_tbl.get() != NULL) { + // We need to walk all the rows and Finalize them here so the UDA gets a chance + // to cleanup. If the hash table is gone (meaning this was spilled), the rows + // should have been finalized/serialized in Spill(). 
+ parent->CleanupHashTbl(agg_fn_evals, hash_tbl->Begin(parent->ht_ctx_.get())); + } + aggregated_row_stream->Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES); + } + if (hash_tbl.get() != NULL) hash_tbl->Close(); + if (unaggregated_row_stream.get() != NULL) { + unaggregated_row_stream->Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES); + } + + for (NewAggFnEvaluator* eval : agg_fn_evals) eval->Close(parent->state_); + if (agg_fn_pool.get() != NULL) agg_fn_pool->free_all(); +} + +Tuple* NewPartitionedAggregationNode::ConstructSingletonOutputTuple( + const vector& agg_fn_evals, MemPool* pool) { + DCHECK(grouping_exprs_.empty()); + Tuple* output_tuple = Tuple::create(intermediate_tuple_desc_->byte_size(), pool); + InitAggSlots(agg_fn_evals, output_tuple); + return output_tuple; +} + +Tuple* NewPartitionedAggregationNode::ConstructIntermediateTuple( + const vector& agg_fn_evals, MemPool* pool, Status* status) { + const int fixed_size = intermediate_tuple_desc_->byte_size(); + const int varlen_size = GroupingExprsVarlenSize(); + const int tuple_data_size = fixed_size + varlen_size; + uint8_t* tuple_data = pool->try_allocate(tuple_data_size); + if (UNLIKELY(tuple_data == NULL)) { + string details = Substitute("Cannot perform aggregation at node with id $0. 
Failed " + "to allocate $1 bytes for intermediate tuple.", _id, tuple_data_size); + *status = pool->mem_tracker()->MemLimitExceeded(state_, details, tuple_data_size); + return NULL; + } + memset(tuple_data, 0, fixed_size); + Tuple* intermediate_tuple = reinterpret_cast(tuple_data); + uint8_t* varlen_data = tuple_data + fixed_size; + CopyGroupingValues(intermediate_tuple, varlen_data, varlen_size); + InitAggSlots(agg_fn_evals, intermediate_tuple); + return intermediate_tuple; +} + +Tuple* NewPartitionedAggregationNode::ConstructIntermediateTuple( + const vector& agg_fn_evals, BufferedTupleStream3* stream, + Status* status) { + DCHECK(stream != NULL && status != NULL); + // Allocate space for the entire tuple in the stream. + const int fixed_size = intermediate_tuple_desc_->byte_size(); + const int varlen_size = GroupingExprsVarlenSize(); + const int tuple_size = fixed_size + varlen_size; + uint8_t* tuple_data = stream->AddRowCustomBegin(tuple_size, status); + if (UNLIKELY(tuple_data == nullptr)) { + // If we failed to allocate and did not hit an error (indicated by a non-ok status), + // the caller of this function can try to free some space, e.g. through spilling, and + // re-attempt to allocate space for this row. + return nullptr; + } + Tuple* tuple = reinterpret_cast(tuple_data); + tuple->init(fixed_size); + uint8_t* varlen_buffer = tuple_data + fixed_size; + CopyGroupingValues(tuple, varlen_buffer, varlen_size); + InitAggSlots(agg_fn_evals, tuple); + stream->AddRowCustomEnd(tuple_size); + return tuple; +} + +int NewPartitionedAggregationNode::GroupingExprsVarlenSize() { + int varlen_size = 0; + // TODO: The hash table could compute this as it hashes. + for (int expr_idx: string_grouping_exprs_) { + StringValue* sv = reinterpret_cast(ht_ctx_->ExprValue(expr_idx)); + // Avoid branching by multiplying length by null bit. + varlen_size += sv->len * !ht_ctx_->ExprValueNull(expr_idx); + } + return varlen_size; +} + +// TODO: codegen this function. 
+void NewPartitionedAggregationNode::CopyGroupingValues(Tuple* intermediate_tuple, + uint8_t* buffer, int varlen_size) { + // Copy over all grouping slots (the variable length data is copied below). + for (int i = 0; i < grouping_exprs_.size(); ++i) { + SlotDescriptor* slot_desc = intermediate_tuple_desc_->slots()[i]; + if (ht_ctx_->ExprValueNull(i)) { + intermediate_tuple->set_null(slot_desc->null_indicator_offset()); + } else { + void* src = ht_ctx_->ExprValue(i); + void* dst = intermediate_tuple->get_slot(slot_desc->tuple_offset()); + memcpy(dst, src, slot_desc->slot_size()); + } + } + + for (int expr_idx: string_grouping_exprs_) { + if (ht_ctx_->ExprValueNull(expr_idx)) continue; + + SlotDescriptor* slot_desc = intermediate_tuple_desc_->slots()[expr_idx]; + // ptr and len were already copied to the fixed-len part of string value + StringValue* sv = reinterpret_cast( + intermediate_tuple->get_slot(slot_desc->tuple_offset())); + memcpy(buffer, sv->ptr, sv->len); + sv->ptr = reinterpret_cast(buffer); + buffer += sv->len; + } +} + +// TODO: codegen this function. +void NewPartitionedAggregationNode::InitAggSlots( + const vector& agg_fn_evals, Tuple* intermediate_tuple) { + vector::const_iterator slot_desc = + intermediate_tuple_desc_->slots().begin() + grouping_exprs_.size(); + for (int i = 0; i < agg_fn_evals.size(); ++i, ++slot_desc) { + // To minimize branching on the UpdateTuple path, initialize the result value so that + // the Add() UDA function can ignore the NULL bit of its destination value. E.g. for + // SUM(), if we initialize the destination value to 0 (with the NULL bit set), we can + // just start adding to the destination value (rather than repeatedly checking the + // destination NULL bit. The codegen'd version of UpdateSlot() exploits this to + // eliminate a branch per value. + // + // For boolean and numeric types, the default values are false/0, so the nullable + // aggregate functions SUM() and AVG() produce the correct result. 
For MIN()/MAX(), + // initialize the value to max/min possible value for the same effect. + NewAggFnEvaluator* eval = agg_fn_evals[i]; + eval->Init(intermediate_tuple); + + DCHECK(agg_fns_[i] == &(eval->agg_fn())); + const AggFn* agg_fn = agg_fns_[i]; + const AggFn::AggregationOp agg_op = agg_fn->agg_op(); + if ((agg_op == AggFn::MIN || agg_op == AggFn::MAX) && + !agg_fn->intermediate_type().is_string_type() && + !agg_fn->intermediate_type().is_date_type()) { + ExprValue default_value; + void* default_value_ptr = NULL; + if (agg_op == AggFn::MIN) { + default_value_ptr = default_value.set_to_max((*slot_desc)->type()); + } else { + DCHECK_EQ(agg_op, AggFn::MAX); + default_value_ptr = default_value.set_to_min((*slot_desc)->type()); + } + RawValue::write(default_value_ptr, intermediate_tuple, *slot_desc, NULL); + } + } +} + +void NewPartitionedAggregationNode::UpdateTuple(NewAggFnEvaluator** agg_fn_evals, + Tuple* tuple, TupleRow* row, bool is_merge) { + DCHECK(tuple != NULL || agg_fns_.empty()); + for (int i = 0; i < agg_fns_.size(); ++i) { + if (is_merge) { + agg_fn_evals[i]->Merge(row->get_tuple(0), tuple); + } else { + agg_fn_evals[i]->Add(row, tuple); + } + } +} + +Tuple* NewPartitionedAggregationNode::GetOutputTuple( + const vector& agg_fn_evals, Tuple* tuple, MemPool* pool) { + DCHECK(tuple != NULL || agg_fn_evals.empty()) << tuple; + Tuple* dst = tuple; + if (needs_finalize_ && intermediate_tuple_id_ != output_tuple_id_) { + dst = Tuple::create(output_tuple_desc_->byte_size(), pool); + } + if (needs_finalize_) { + NewAggFnEvaluator::Finalize(agg_fn_evals, tuple, dst); + } else { + NewAggFnEvaluator::Serialize(agg_fn_evals, tuple); + } + // Copy grouping values from tuple to dst. + // TODO: Codegen this. 
+ if (dst != tuple) { + int num_grouping_slots = grouping_exprs_.size(); + for (int i = 0; i < num_grouping_slots; ++i) { + SlotDescriptor* src_slot_desc = intermediate_tuple_desc_->slots()[i]; + SlotDescriptor* dst_slot_desc = output_tuple_desc_->slots()[i]; + bool src_slot_null = tuple->is_null(src_slot_desc->null_indicator_offset()); + void* src_slot = NULL; + if (!src_slot_null) src_slot = tuple->get_slot(src_slot_desc->tuple_offset()); + RawValue::write(src_slot, dst, dst_slot_desc, NULL); + } + } + return dst; +} + +template +Status NewPartitionedAggregationNode::AppendSpilledRow( + Partition* partition, TupleRow* row) { + DCHECK(!is_streaming_preagg_); + DCHECK(partition->is_spilled()); + BufferedTupleStream3* stream = AGGREGATED_ROWS ? + partition->aggregated_row_stream.get() : + partition->unaggregated_row_stream.get(); + DCHECK(!stream->is_pinned()); + Status status; + if (LIKELY(stream->AddRow(row, &status))) return Status::OK; + RETURN_IF_ERROR(status); + + // Keep trying to free memory by spilling until we succeed or hit an error. + // Running out of partitions to spill is treated as an error by SpillPartition(). 
+ while (true) { + RETURN_IF_ERROR(SpillPartition(AGGREGATED_ROWS)); + if (stream->AddRow(row, &status)) return Status::OK; + RETURN_IF_ERROR(status); + } +} + +string NewPartitionedAggregationNode::DebugString(int indentation_level) const { + stringstream ss; + DebugString(indentation_level, &ss); + return ss.str(); +} + +void NewPartitionedAggregationNode::DebugString(int indentation_level, + stringstream* out) const { + *out << string(indentation_level * 2, ' '); + *out << "NewPartitionedAggregationNode(" + << "intermediate_tuple_id=" << intermediate_tuple_id_ + << " output_tuple_id=" << output_tuple_id_ + << " needs_finalize=" << needs_finalize_ + << " grouping_exprs=" << Expr::debug_string(grouping_exprs_) + << " agg_exprs=" << AggFn::DebugString(agg_fns_); + ExecNode::debug_string(indentation_level, out); + *out << ")"; +} + +Status NewPartitionedAggregationNode::CreateHashPartitions( + int level, int single_partition_idx) { + if (is_streaming_preagg_) DCHECK_EQ(level, 0); + if (UNLIKELY(level >= MAX_PARTITION_DEPTH)) { + stringstream error_msg; + error_msg << "Cannot perform aggregation at hash aggregation node with id " + << _id << '.' + << " The input data was partitioned the maximum number of " + << MAX_PARTITION_DEPTH << " times." 
+ << " This could mean there is significant skew in the data or the memory limit is" + << " set too low."; + return state_->set_mem_limit_exceeded(error_msg.str()); + } + ht_ctx_->set_level(level); + + DCHECK(hash_partitions_.empty()); + int num_partitions_created = 0; + for (int i = 0; i < PARTITION_FANOUT; ++i) { + hash_tbls_[i] = nullptr; + if (single_partition_idx == -1 || i == single_partition_idx) { + Partition* new_partition = partition_pool_->add(new Partition(this, level, i)); + ++num_partitions_created; + hash_partitions_.push_back(new_partition); + RETURN_IF_ERROR(new_partition->InitStreams()); + } else { + hash_partitions_.push_back(nullptr); + } + } + + // Now that all the streams are reserved (meaning we have enough memory to execute + // the algorithm), allocate the hash tables. These can fail and we can still continue. + for (int i = 0; i < PARTITION_FANOUT; ++i) { + Partition* partition = hash_partitions_[i]; + if (partition == nullptr) continue; + if (partition->aggregated_row_stream == nullptr) { + // Failed to create the aggregated row stream - cannot create a hash table. + // Just continue with a NULL hash table so rows will be passed through. + DCHECK(is_streaming_preagg_); + } else { + bool got_memory; + RETURN_IF_ERROR(partition->InitHashTable(&got_memory)); + // Spill the partition if we cannot create a hash table for a merge aggregation. + if (UNLIKELY(!got_memory)) { + DCHECK(!is_streaming_preagg_) << "Preagg reserves enough memory for hash tables"; + // If we're repartitioning, we will be writing aggregated rows first. + RETURN_IF_ERROR(partition->Spill(level > 0)); + } + } + hash_tbls_[i] = partition->hash_tbl.get(); + } + // In this case we did not have to repartition, so ensure that while building the hash + // table all rows will be inserted into the partition at 'single_partition_idx' in case + // a non deterministic grouping expression causes a row to hash to a different + // partition index. 
+ if (single_partition_idx != -1) { + Partition* partition = hash_partitions_[single_partition_idx]; + for (int i = 0; i < PARTITION_FANOUT; ++i) { + hash_partitions_[i] = partition; + hash_tbls_[i] = partition->hash_tbl.get(); + } + } + + COUNTER_UPDATE(partitions_created_, num_partitions_created); + if (!is_streaming_preagg_) { + COUNTER_SET(max_partition_level_, level); + } + return Status::OK; +} + +Status NewPartitionedAggregationNode::CheckAndResizeHashPartitions( + bool partitioning_aggregated_rows, int num_rows, const NewPartitionedHashTableCtx* ht_ctx) { + DCHECK(!is_streaming_preagg_); + for (int i = 0; i < PARTITION_FANOUT; ++i) { + Partition* partition = hash_partitions_[i]; + if (partition == nullptr) continue; + while (!partition->is_spilled()) { + { + SCOPED_TIMER(ht_resize_timer_); + bool resized; + RETURN_IF_ERROR(partition->hash_tbl->CheckAndResize(num_rows, ht_ctx, &resized)); + if (resized) break; + } + RETURN_IF_ERROR(SpillPartition(partitioning_aggregated_rows)); + } + } + return Status::OK; +} + +Status NewPartitionedAggregationNode::NextPartition() { + DCHECK(output_partition_ == nullptr); + + if (!is_in_subplan() && spilled_partitions_.empty()) { + // All partitions are in memory. Release reservation that was used for previous + // partitions that is no longer needed. If we have spilled partitions, we want to + // hold onto all reservation in case it is needed to process the spilled partitions. + DCHECK(!_buffer_pool_client.has_unpinned_pages()); + Status status = release_unused_reservation(); + DCHECK(status.ok()) << "Should not fail - all partitions are in memory so there are " + << "no unpinned pages. " << status.get_error_msg(); + } + + // Keep looping until we get to a partition that fits in memory. + Partition* partition = nullptr; + while (true) { + // First return partitions that are fully aggregated (and in memory). 
+ if (!aggregated_partitions_.empty()) { + partition = aggregated_partitions_.front(); + DCHECK(!partition->is_spilled()); + aggregated_partitions_.pop_front(); + break; + } + + // No aggregated partitions in memory - we should not be using any reservation aside + // from 'serialize_stream_'. + DCHECK_EQ(serialize_stream_ != nullptr ? serialize_stream_->BytesPinned(false) : 0, + _buffer_pool_client.GetUsedReservation()) << _buffer_pool_client.DebugString(); + + // Try to fit a single spilled partition in memory. We can often do this because + // we only need to fit 1/PARTITION_FANOUT of the data in memory. + // TODO: in some cases when the partition probably won't fit in memory it could + // be better to skip directly to repartitioning. + RETURN_IF_ERROR(BuildSpilledPartition(&partition)); + if (partition != nullptr) break; + + // If we can't fit the partition in memory, repartition it. + RETURN_IF_ERROR(RepartitionSpilledPartition()); + } + DCHECK(!partition->is_spilled()); + DCHECK(partition->hash_tbl.get() != nullptr); + DCHECK(partition->aggregated_row_stream->is_pinned()); + + output_partition_ = partition; + output_iterator_ = output_partition_->hash_tbl->Begin(ht_ctx_.get()); + COUNTER_UPDATE(num_hash_buckets_, output_partition_->hash_tbl->num_buckets()); + return Status::OK; +} + +Status NewPartitionedAggregationNode::BuildSpilledPartition(Partition** built_partition) { + DCHECK(!spilled_partitions_.empty()); + DCHECK(!is_streaming_preagg_); + // Leave the partition in 'spilled_partitions_' to be closed if we hit an error. + Partition* src_partition = spilled_partitions_.front(); + DCHECK(src_partition->is_spilled()); + + // Create a new hash partition from the rows of the spilled partition. This is simpler + // than trying to finish building a partially-built partition in place. We only + // initialise one hash partition that all rows in 'src_partition' will hash to. 
+ RETURN_IF_ERROR(CreateHashPartitions(src_partition->level, src_partition->idx)); + Partition* dst_partition = hash_partitions_[src_partition->idx]; + DCHECK(dst_partition != nullptr); + + // Rebuild the hash table over spilled aggregate rows then start adding unaggregated + // rows to the hash table. It's possible the partition will spill at either stage. + // In that case we need to finish processing 'src_partition' so that all rows are + // appended to 'dst_partition'. + // TODO: if the partition spills again but the aggregation reduces the input + // significantly, we could do better here by keeping the incomplete hash table in + // memory and only spilling unaggregated rows that didn't fit in the hash table + // (somewhat similar to the passthrough pre-aggregation). + RETURN_IF_ERROR(ProcessStream(src_partition->aggregated_row_stream.get())); + RETURN_IF_ERROR(ProcessStream(src_partition->unaggregated_row_stream.get())); + src_partition->Close(false); + spilled_partitions_.pop_front(); + hash_partitions_.clear(); + + if (dst_partition->is_spilled()) { + PushSpilledPartition(dst_partition); + *built_partition = nullptr; + // Spilled the partition - we should not be using any reservation except from + // 'serialize_stream_'. + DCHECK_EQ(serialize_stream_ != nullptr ? serialize_stream_->BytesPinned(false) : 0, + _buffer_pool_client.GetUsedReservation()) << _buffer_pool_client.DebugString(); + } else { + *built_partition = dst_partition; + } + return Status::OK; +} + +Status NewPartitionedAggregationNode::RepartitionSpilledPartition() { + DCHECK(!spilled_partitions_.empty()); + DCHECK(!is_streaming_preagg_); + // Leave the partition in 'spilled_partitions_' to be closed if we hit an error. + Partition* partition = spilled_partitions_.front(); + DCHECK(partition->is_spilled()); + + // Create the new hash partitions to repartition into. This will allocate a + // write buffer for each partition's aggregated row stream. 
+ RETURN_IF_ERROR(CreateHashPartitions(partition->level + 1)); + COUNTER_UPDATE(num_repartitions_, 1); + + // Rows in this partition could have been spilled into two streams, depending + // on if it is an aggregated intermediate, or an unaggregated row. Aggregated + // rows are processed first to save a hash table lookup in ProcessBatch(). + RETURN_IF_ERROR(ProcessStream(partition->aggregated_row_stream.get())); + + // Prepare write buffers so we can append spilled rows to unaggregated partitions. + for (Partition* hash_partition : hash_partitions_) { + if (!hash_partition->is_spilled()) continue; + // The aggregated rows have been repartitioned. Free up at least a buffer's worth of + // reservation and use it to pin the unaggregated write buffer. +// hash_partition->aggregated_row_stream->UnpinStream(BufferedTupleStream3::UNPIN_ALL); + bool got_buffer; + RETURN_IF_ERROR( + hash_partition->unaggregated_row_stream->PrepareForWrite(&got_buffer)); + DCHECK(got_buffer) + << "Accounted in min reservation" << _buffer_pool_client.DebugString(); + } + RETURN_IF_ERROR(ProcessStream(partition->unaggregated_row_stream.get())); + + COUNTER_UPDATE(num_row_repartitioned_, partition->aggregated_row_stream->num_rows()); + COUNTER_UPDATE(num_row_repartitioned_, partition->unaggregated_row_stream->num_rows()); + + partition->Close(false); + spilled_partitions_.pop_front(); + + // Done processing this partition. Move the new partitions into + // spilled_partitions_/aggregated_partitions_. 
+  int64_t num_input_rows = partition->aggregated_row_stream->num_rows() +
+      partition->unaggregated_row_stream->num_rows();
+  RETURN_IF_ERROR(MoveHashPartitions(num_input_rows));
+  return Status::OK;
+}
+
+template <bool AGGREGATED_ROWS>
+Status NewPartitionedAggregationNode::ProcessStream(BufferedTupleStream3* input_stream) {
+  DCHECK(!is_streaming_preagg_);
+  if (input_stream->num_rows() > 0) {
+    while (true) {
+      bool got_buffer = false;
+      RETURN_IF_ERROR(input_stream->PrepareForRead(true, &got_buffer));
+      if (got_buffer) break;
+      // Did not have a buffer to read the input stream. Spill and try again.
+      RETURN_IF_ERROR(SpillPartition(AGGREGATED_ROWS));
+    }
+
+    bool eos = false;
+    const RowDescriptor* desc =
+        AGGREGATED_ROWS ? &intermediate_row_desc_ : &(_children[0]->row_desc());
+    RowBatch batch(*desc, state_->batch_size(), const_cast<MemTracker*>(mem_tracker()));
+    do {
+      RETURN_IF_ERROR(input_stream->GetNext(&batch, &eos));
+      RETURN_IF_ERROR(
+          ProcessBatch<AGGREGATED_ROWS>(&batch, ht_ctx_.get()));
+      RETURN_IF_ERROR(state_->check_query_state());
+      batch.reset();
+    } while (!eos);
+  }
+  input_stream->Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
+  return Status::OK;
+}
+
+Status NewPartitionedAggregationNode::SpillPartition(bool more_aggregate_rows) {
+  int64_t max_freed_mem = 0;
+  int partition_idx = -1;
+
+  // Iterate over the partitions and pick the largest partition that is not spilled.
+  for (int i = 0; i < hash_partitions_.size(); ++i) {
+    if (hash_partitions_[i] == nullptr) continue;
+    if (hash_partitions_[i]->is_closed) continue;
+    if (hash_partitions_[i]->is_spilled()) continue;
+    // Pass 'true' because we need to keep the write block pinned. See Partition::Spill().
+    int64_t mem = hash_partitions_[i]->aggregated_row_stream->BytesPinned(true);
+    mem += hash_partitions_[i]->hash_tbl->ByteSize();
+    mem += hash_partitions_[i]->agg_fn_pool->total_reserved_bytes();
+    DCHECK_GT(mem, 0); // At least the hash table buckets should occupy memory.
+ if (mem > max_freed_mem) { + max_freed_mem = mem; + partition_idx = i; + } + } + DCHECK_NE(partition_idx, -1) << "Should have been able to spill a partition to " + << "reclaim memory: " << _buffer_pool_client.DebugString(); + // Remove references to the destroyed hash table from 'hash_tbls_'. + // Additionally, we might be dealing with a rebuilt spilled partition, where all + // partitions point to a single in-memory partition. This also ensures that 'hash_tbls_' + // remains consistent in that case. + for (int i = 0; i < PARTITION_FANOUT; ++i) { + if (hash_partitions_[i] == hash_partitions_[partition_idx]) hash_tbls_[i] = nullptr; + } + return hash_partitions_[partition_idx]->Spill(more_aggregate_rows); +} + +Status NewPartitionedAggregationNode::MoveHashPartitions(int64_t num_input_rows) { + DCHECK(!hash_partitions_.empty()); + std::stringstream ss; + ss << "PA(node_id=" << id() << ") partitioned(level=" << hash_partitions_[0]->level + << ") " << num_input_rows << " rows into:" << std::endl; + for (int i = 0; i < hash_partitions_.size(); ++i) { + Partition* partition = hash_partitions_[i]; + if (partition == nullptr) continue; + // We might be dealing with a rebuilt spilled partition, where all partitions are + // pointing to a single in-memory partition, so make sure we only proceed for the + // right partition. + if(i != partition->idx) continue; + int64_t aggregated_rows = 0; + if (partition->aggregated_row_stream != nullptr) { + aggregated_rows = partition->aggregated_row_stream->num_rows(); + } + int64_t unaggregated_rows = 0; + if (partition->unaggregated_row_stream != nullptr) { + unaggregated_rows = partition->unaggregated_row_stream->num_rows(); + } + double total_rows = aggregated_rows + unaggregated_rows; + double percent = total_rows * 100 / num_input_rows; + ss << " " << i << " " << (partition->is_spilled() ? 
"spilled" : "not spilled")
+       << " (fraction=" << std::fixed << std::setprecision(2) << percent << "%)" << std::endl
+       << "      #aggregated rows:" << aggregated_rows << std::endl
+       << "      #unaggregated rows: " << unaggregated_rows << std::endl;
+
+    // TODO: update counters to support doubles.
+    COUNTER_SET(largest_partition_percent_, static_cast<int64_t>(percent));
+
+    if (total_rows == 0) {
+      partition->Close(false);
+    } else if (partition->is_spilled()) {
+      PushSpilledPartition(partition);
+    } else {
+      aggregated_partitions_.push_back(partition);
+    }
+
+  }
+  VLOG(2) << ss.str();
+  hash_partitions_.clear();
+  return Status::OK;
+}
+
+void NewPartitionedAggregationNode::PushSpilledPartition(Partition* partition) {
+  DCHECK(partition->is_spilled());
+  DCHECK(partition->hash_tbl == nullptr);
+  // Ensure all pages in the spilled partition's streams are unpinned by invalidating
+  // the streams' read and write iterators. We may need all the memory to process the
+  // next spilled partitions.
+// partition->aggregated_row_stream->UnpinStream(BufferedTupleStream3::UNPIN_ALL);
+// partition->unaggregated_row_stream->UnpinStream(BufferedTupleStream3::UNPIN_ALL);
+  spilled_partitions_.push_front(partition);
+}
+
+void NewPartitionedAggregationNode::ClosePartitions() {
+  for (Partition* partition : hash_partitions_) {
+    if (partition != nullptr) partition->Close(true);
+  }
+  hash_partitions_.clear();
+  for (Partition* partition : aggregated_partitions_) partition->Close(true);
+  aggregated_partitions_.clear();
+  for (Partition* partition : spilled_partitions_) partition->Close(true);
+  spilled_partitions_.clear();
+  memset(hash_tbls_, 0, sizeof(hash_tbls_));
+  partition_pool_->clear();
+}
+
+//Status NewPartitionedAggregationNode::QueryMaintenance(RuntimeState* state) {
+//  NewAggFnEvaluator::FreeLocalAllocations(agg_fn_evals_);
+//  for (Partition* partition : hash_partitions_) {
+//    if (partition != nullptr) {
+//      NewAggFnEvaluator::FreeLocalAllocations(partition->agg_fn_evals);
+//    }
+//
} +// if (ht_ctx_.get() != nullptr) ht_ctx_->FreeLocalAllocations(); +// return ExecNode::QueryMaintenance(state); +//} + +#if 0 + +// IR Generation for updating a single aggregation slot. Signature is: +// void UpdateSlot(FunctionContext* agg_fn_ctx, ExprContext* agg_expr_ctx, +// AggTuple* agg_tuple, char** row) +// +// The IR for sum(double_col), which is constructed directly with the IRBuilder, is: +// +// define void @UpdateSlot(%"class.impala_udf::FunctionContext"* %agg_fn_ctx, +// %"class.impala::ExprContext"** %agg_expr_ctxs, +// { i8, [7 x i8], double }* %agg_tuple, %"class.impala::TupleRow"* %row) #34 { +// entry: +// %expr_ctx_ptr = getelementptr %"class.impala::ExprContext"*, +// %"class.impala::ExprContext"** %agg_expr_ctxs, i32 0 +// %expr_ctx = load %"class.impala::ExprContext"*, +// %"class.impala::ExprContext"** %expr_ctx_ptr +// %input0 = call { i8, double } @GetSlotRef(%"class.impala::ExprContext"* %expr_ctx, +// %"class.impala::TupleRow"* %row) +// %dst_slot_ptr = getelementptr inbounds { i8, [7 x i8], double }, +// { i8, [7 x i8], double }* %agg_tuple, i32 0, i32 2 +// %dst_val = load double, double* %dst_slot_ptr +// %0 = extractvalue { i8, double } %input0, 0 +// %is_null = trunc i8 %0 to i1 +// br i1 %is_null, label %ret, label %not_null +// +// ret: ; preds = %not_null, %entry +// ret void +// +// not_null: ; preds = %entry +// %val = extractvalue { i8, double } %input0, 1 +// %1 = fadd double %dst_val, %val +// %2 = bitcast { i8, [7 x i8], double }* %agg_tuple to i8* +// %null_byte_ptr = getelementptr i8, i8* %2, i32 0 +// %null_byte = load i8, i8* %null_byte_ptr +// %null_bit_cleared = and i8 %null_byte, -2 +// store i8 %null_bit_cleared, i8* %null_byte_ptr +// store double %1, double* %dst_slot_ptr +// br label %ret +// } +// +// The IR for min(timestamp_col), which uses the UDA interface, is: +// +// define void @UpdateSlot(%"class.impala_udf::FunctionContext"* %agg_fn_ctx, +// %"class.impala::ExprContext"** %agg_expr_ctxs, +// { i8, [7 
x i8], %"class.impala::TimestampValue" }* %agg_tuple, +// %"class.impala::TupleRow"* %row) #34 { +// entry: +// %dst_lowered_ptr = alloca { i64, i64 } +// %input_lowered_ptr = alloca { i64, i64 } +// %expr_ctx_ptr = getelementptr %"class.impala::ExprContext"*, +// %"class.impala::ExprContext"** %agg_expr_ctxs, i32 0 +// %expr_ctx = load %"class.impala::ExprContext"*, +// %"class.impala::ExprContext"** %expr_ctx_ptr +// %input0 = call { i64, i64 } @GetSlotRef(%"class.impala::ExprContext"* %expr_ctx, +// %"class.impala::TupleRow"* %row) +// %dst_slot_ptr = getelementptr inbounds { i8, [7 x i8], +// %"class.impala::TimestampValue" }, { i8, [7 x i8], +// %"class.impala::TimestampValue" }* %agg_tuple, i32 0, i32 2 +// %dst_val = load %"class.impala::TimestampValue", +// %"class.impala::TimestampValue"* %dst_slot_ptr +// %0 = bitcast { i8, [7 x i8], %"class.impala::TimestampValue" }* %agg_tuple to i8* +// %null_byte_ptr = getelementptr i8, i8* %0, i32 0 +// %null_byte = load i8, i8* %null_byte_ptr +// %null_mask = and i8 %null_byte, 1 +// %is_null = icmp ne i8 %null_mask, 0 +// %is_null_ext = zext i1 %is_null to i64 +// %1 = or i64 0, %is_null_ext +// %dst = insertvalue { i64, i64 } zeroinitializer, i64 %1, 0 +// %time_of_day = extractvalue %"class.impala::TimestampValue" %dst_val, 0, 0, 0, 0 +// %dst1 = insertvalue { i64, i64 } %dst, i64 %time_of_day, 1 +// %date = extractvalue %"class.impala::TimestampValue" %dst_val, 1, 0, 0 +// %2 = extractvalue { i64, i64 } %dst1, 0 +// %3 = zext i32 %date to i64 +// %4 = shl i64 %3, 32 +// %5 = and i64 %2, 4294967295 +// %6 = or i64 %5, %4 +// %dst2 = insertvalue { i64, i64 } %dst1, i64 %6, 0 +// store { i64, i64 } %input0, { i64, i64 }* %input_lowered_ptr +// %input_unlowered_ptr = bitcast { i64, i64 }* %input_lowered_ptr +// to %"struct.impala_udf::TimestampVal"* +// store { i64, i64 } %dst2, { i64, i64 }* %dst_lowered_ptr +// %dst_unlowered_ptr = bitcast { i64, i64 }* %dst_lowered_ptr +// to %"struct.impala_udf::TimestampVal"* 
+// call void +// @_ZN6impala18AggregateFunctions3MinIN10impala_udf12TimestampValEEEvPNS2_15FunctionContextERKT_PS6_.2( +// %"class.impala_udf::FunctionContext"* %agg_fn_ctx, +// %"struct.impala_udf::TimestampVal"* %input_unlowered_ptr, +// %"struct.impala_udf::TimestampVal"* %dst_unlowered_ptr) +// %anyval_result = load { i64, i64 }, { i64, i64 }* %dst_lowered_ptr +// %7 = extractvalue { i64, i64 } %anyval_result, 1 +// %8 = insertvalue %"class.impala::TimestampValue" zeroinitializer, i64 %7, 0, 0, 0, 0 +// %9 = extractvalue { i64, i64 } %anyval_result, 0 +// %10 = ashr i64 %9, 32 +// %11 = trunc i64 %10 to i32 +// %12 = insertvalue %"class.impala::TimestampValue" %8, i32 %11, 1, 0, 0 +// %13 = extractvalue { i64, i64 } %anyval_result, 0 +// %result_is_null = trunc i64 %13 to i1 +// %14 = bitcast { i8, [7 x i8], %"class.impala::TimestampValue" }* %agg_tuple to i8* +// %null_byte_ptr3 = getelementptr i8, i8* %14, i32 0 +// %null_byte4 = load i8, i8* %null_byte_ptr3 +// %null_bit_cleared = and i8 %null_byte4, -2 +// %15 = sext i1 %result_is_null to i8 +// %null_bit = and i8 %15, 1 +// %null_bit_set = or i8 %null_bit_cleared, %null_bit +// store i8 %null_bit_set, i8* %null_byte_ptr3 +// store %"class.impala::TimestampValue" %12, +// %"class.impala::TimestampValue"* %dst_slot_ptr +// br label %ret +// +// ret: ; preds = %entry +// ret void +// } +// +//Status NewPartitionedAggregationNode::CodegenUpdateSlot(LlvmCodeGen* codegen, +// NewAggFnEvaluator* evaluator, int evaluator_idx, SlotDescriptor* slot_desc, +// Function** fn) { +// PointerType* fn_ctx_type = +// codegen->GetPtrType(FunctionContextImpl::LLVM_FUNCTIONCONTEXT_NAME); +// PointerType* expr_ctxs_type = +// codegen->GetPtrPtrType(codegen->GetType(ExprContext::LLVM_CLASS_NAME)); +// StructType* tuple_struct = intermediate_tuple_desc_->GetLlvmStruct(codegen); +// if (tuple_struct == NULL) { +// return Status("NewPartitionedAggregationNode::CodegenUpdateSlot(): failed to generate " +// "intermediate tuple 
desc"); +// } +// PointerType* tuple_ptr_type = codegen->GetPtrType(tuple_struct); +// PointerType* tuple_row_ptr_type = codegen->GetPtrType(TupleRow::LLVM_CLASS_NAME); +// +// // Create UpdateSlot prototype +// LlvmCodeGen::FnPrototype prototype(codegen, "UpdateSlot", codegen->void_type()); +// prototype.AddArgument(LlvmCodeGen::NamedVariable("agg_fn_ctx", fn_ctx_type)); +// prototype.AddArgument(LlvmCodeGen::NamedVariable("agg_expr_ctxs", expr_ctxs_type)); +// prototype.AddArgument(LlvmCodeGen::NamedVariable("agg_tuple", tuple_ptr_type)); +// prototype.AddArgument(LlvmCodeGen::NamedVariable("row", tuple_row_ptr_type)); +// +// LlvmBuilder builder(codegen->context()); +// Value* args[4]; +// *fn = prototype.GeneratePrototype(&builder, &args[0]); +// Value* agg_fn_ctx_arg = args[0]; +// Value* agg_expr_ctxs_arg = args[1]; +// Value* agg_tuple_arg = args[2]; +// Value* row_arg = args[3]; +// +// DCHECK_GE(evaluator->input_expr_ctxs().size(), 1); +// vector input_vals; +// for (int i = 0; i < evaluator->input_expr_ctxs().size(); ++i) { +// ExprContext* agg_expr_ctx = evaluator->input_expr_ctxs()[i]; +// Expr* agg_expr = agg_expr_ctx->root(); +// Function* agg_expr_fn; +// RETURN_IF_ERROR(agg_expr->GetCodegendComputeFn(codegen, &agg_expr_fn)); +// DCHECK(agg_expr_fn != NULL); +// +// // Call expr function with the matching expr context to get src slot value. 
+// Value* expr_ctx_ptr = builder.CreateInBoundsGEP( +// agg_expr_ctxs_arg, codegen->GetIntConstant(TYPE_INT, i), "expr_ctx_ptr"); +// Value* expr_ctx = builder.CreateLoad(expr_ctx_ptr, "expr_ctx"); +// string input_name = Substitute("input$0", i); +// input_vals.push_back( +// CodegenAnyVal::CreateCallWrapped(codegen, &builder, agg_expr->type(), agg_expr_fn, +// ArrayRef({expr_ctx, row_arg}), input_name.c_str())); +// } +// +// NewAggFnEvaluator::AggregationOp agg_op = evaluator->agg_op(); +// const ColumnType& dst_type = evaluator->intermediate_type(); +// bool dst_is_int_or_float_or_bool = dst_type.IsIntegerType() +// || dst_type.IsFloatingPointType() || dst_type.IsBooleanType(); +// bool dst_is_numeric_or_bool = dst_is_int_or_float_or_bool || dst_type.IsDecimalType(); +// +// BasicBlock* ret_block = BasicBlock::Create(codegen->context(), "ret", *fn); +// +// // Emit the code to compute 'result' and set the NULL indicator if needed. First check +// // for special cases where we can emit a very simple instruction sequence, then fall +// // back to the general-purpose approach of calling the cross-compiled builtin UDA. +// CodegenAnyVal& src = input_vals[0]; +// // 'dst_slot_ptr' points to the slot in the aggregate tuple to update. 
+// Value* dst_slot_ptr = builder.CreateStructGEP( +// NULL, agg_tuple_arg, slot_desc->llvm_field_idx(), "dst_slot_ptr"); +// Value* result = NULL; +// Value* dst_value = builder.CreateLoad(dst_slot_ptr, "dst_val"); +// if (agg_op == NewAggFnEvaluator::COUNT) { +// src.CodegenBranchIfNull(&builder, ret_block); +// if (evaluator->is_merge()) { +// result = builder.CreateAdd(dst_value, src.GetVal(), "count_sum"); +// } else { +// result = builder.CreateAdd( +// dst_value, codegen->GetIntConstant(TYPE_BIGINT, 1), "count_inc"); +// } +// DCHECK(!slot_desc->is_nullable()); +// } else if ((agg_op == NewAggFnEvaluator::MIN || agg_op == NewAggFnEvaluator::MAX) +// && dst_is_numeric_or_bool) { +// bool is_min = agg_op == NewAggFnEvaluator::MIN; +// src.CodegenBranchIfNull(&builder, ret_block); +// Function* min_max_fn = codegen->CodegenMinMax(slot_desc->type(), is_min); +// Value* min_max_args[] = {dst_value, src.GetVal()}; +// result = +// builder.CreateCall(min_max_fn, min_max_args, is_min ? "min_value" : "max_value"); +// // Dst may have been NULL, make sure to unset the NULL bit. +// DCHECK(slot_desc->is_nullable()); +// slot_desc->CodegenSetNullIndicator( +// codegen, &builder, agg_tuple_arg, codegen->false_value()); +// } else if (agg_op == NewAggFnEvaluator::SUM && dst_is_int_or_float_or_bool) { +// src.CodegenBranchIfNull(&builder, ret_block); +// if (dst_type.IsFloatingPointType()) { +// result = builder.CreateFAdd(dst_value, src.GetVal()); +// } else { +// result = builder.CreateAdd(dst_value, src.GetVal()); +// } +// // Dst may have been NULL, make sure to unset the NULL bit. +// DCHECK(slot_desc->is_nullable()); +// slot_desc->CodegenSetNullIndicator( +// codegen, &builder, agg_tuple_arg, codegen->false_value()); +// } else { +// // The remaining cases are implemented using the UDA interface. 
+// // Create intermediate argument 'dst' from 'dst_value' +// CodegenAnyVal dst = CodegenAnyVal::GetNonNullVal(codegen, &builder, dst_type, "dst"); +// +// // For a subset of builtins we generate a different code sequence that exploits two +// // properties of the builtins. First, NULL input values can be skipped. Second, the +// // value of the slot was initialized in the right way in InitAggSlots() (e.g. 0 for +// // SUM) that we get the right result if UpdateSlot() pretends that the NULL bit of +// // 'dst' is unset. Empirically this optimisation makes TPC-H Q1 5-10% faster. +// bool special_null_handling = !evaluator->intermediate_type().IsStringType() +// && !evaluator->intermediate_type().IsTimestampType() +// && (agg_op == NewAggFnEvaluator::MIN || agg_op == NewAggFnEvaluator::MAX +// || agg_op == NewAggFnEvaluator::SUM || agg_op == NewAggFnEvaluator::AVG +// || agg_op == NewAggFnEvaluator::NDV); +// if (slot_desc->is_nullable()) { +// if (special_null_handling) { +// src.CodegenBranchIfNull(&builder, ret_block); +// slot_desc->CodegenSetNullIndicator( +// codegen, &builder, agg_tuple_arg, codegen->false_value()); +// } else { +// dst.SetIsNull(slot_desc->CodegenIsNull(codegen, &builder, agg_tuple_arg)); +// } +// } +// dst.SetFromRawValue(dst_value); +// +// // Call the UDA to update/merge 'src' into 'dst', with the result stored in +// // 'updated_dst_val'. +// CodegenAnyVal updated_dst_val; +// RETURN_IF_ERROR(CodegenCallUda(codegen, &builder, evaluator, agg_fn_ctx_arg, +// input_vals, dst, &updated_dst_val)); +// result = updated_dst_val.ToNativeValue(); +// +// if (slot_desc->is_nullable() && !special_null_handling) { +// // Set NULL bit in the slot based on the return value. +// Value* result_is_null = updated_dst_val.GetIsNull("result_is_null"); +// slot_desc->CodegenSetNullIndicator( +// codegen, &builder, agg_tuple_arg, result_is_null); +// } +// } +// +// // TODO: Store to register in the loop and store once to memory at the end of the loop. 
+// builder.CreateStore(result, dst_slot_ptr); +// builder.CreateBr(ret_block); +// +// builder.SetInsertPoint(ret_block); +// builder.CreateRetVoid(); +// +// // Avoid producing huge UpdateTuple() function after inlining - LLVM's optimiser +// // memory/CPU usage scales super-linearly with function size. +// // E.g. compute stats on all columns of a 1000-column table previously took 4 minutes to +// // codegen because all the UpdateSlot() functions were inlined. +// if (evaluator_idx >= LlvmCodeGen::CODEGEN_INLINE_EXPRS_THRESHOLD) { +// codegen->SetNoInline(*fn); +// } +// +// *fn = codegen->FinalizeFunction(*fn); +// if (*fn == NULL) { +// return Status("NewPartitionedAggregationNode::CodegenUpdateSlot(): codegen'd " +// "UpdateSlot() function failed verification, see log"); +// } +// return Status::OK; +//} +// +//Status NewPartitionedAggregationNode::CodegenCallUda(LlvmCodeGen* codegen, +// LlvmBuilder* builder, NewAggFnEvaluator* evaluator, Value* agg_fn_ctx_arg, +// const vector& input_vals, const CodegenAnyVal& dst, +// CodegenAnyVal* updated_dst_val) { +// DCHECK_EQ(evaluator->input_expr_ctxs().size(), input_vals.size()); +// Function* uda_fn; +// RETURN_IF_ERROR(evaluator->GetUpdateOrMergeFunction(codegen, &uda_fn)); +// +// // Set up arguments for call to UDA, which are the FunctionContext*, followed by +// // pointers to all input values, followed by a pointer to the destination value. +// vector uda_fn_args; +// uda_fn_args.push_back(agg_fn_ctx_arg); +// +// // Create pointers to input args to pass to uda_fn. We must use the unlowered type, +// // e.g. IntVal, because the UDA interface expects the values to be passed as const +// // references to the classes. +// for (int i = 0; i < evaluator->input_expr_ctxs().size(); ++i) { +// uda_fn_args.push_back(input_vals[i].GetUnloweredPtr("input_unlowered_ptr")); +// } +// +// // Create pointer to dst to pass to uda_fn. We must use the unlowered type for the +// // same reason as above. 
+// Value* dst_lowered_ptr = dst.GetLoweredPtr("dst_lowered_ptr"); +// const ColumnType& dst_type = evaluator->intermediate_type(); +// Type* dst_unlowered_ptr_type = CodegenAnyVal::GetUnloweredPtrType(codegen, dst_type); +// Value* dst_unlowered_ptr = builder->CreateBitCast( +// dst_lowered_ptr, dst_unlowered_ptr_type, "dst_unlowered_ptr"); +// uda_fn_args.push_back(dst_unlowered_ptr); +// +// // Call 'uda_fn' +// builder->CreateCall(uda_fn, uda_fn_args); +// +// // Convert intermediate 'dst_arg' back to the native type. +// Value* anyval_result = builder->CreateLoad(dst_lowered_ptr, "anyval_result"); +// +// *updated_dst_val = CodegenAnyVal(codegen, builder, dst_type, anyval_result); +// return Status::OK; +//} + +// IR codegen for the UpdateTuple loop. This loop is query specific and based on the +// aggregate functions. The function signature must match the non- codegen'd UpdateTuple +// exactly. +// For the query: +// select count(*), count(int_col), sum(double_col) the IR looks like: +// +// ; Function Attrs: alwaysinline +// define void @UpdateTuple(%"class.impala::NewPartitionedAggregationNode"* %this_ptr, +// %"class.impala_udf::FunctionContext"** %agg_fn_ctxs, %"class.impala::Tuple"* +// %tuple, +// %"class.impala::TupleRow"* %row, i1 %is_merge) #34 { +// entry: +// %tuple1 = +// bitcast %"class.impala::Tuple"* %tuple to { i8, [7 x i8], i64, i64, double }* +// %src_slot = getelementptr inbounds { i8, [7 x i8], i64, i64, double }, +// { i8, [7 x i8], i64, i64, double }* %tuple1, i32 0, i32 2 +// %count_star_val = load i64, i64* %src_slot +// %count_star_inc = add i64 %count_star_val, 1 +// store i64 %count_star_inc, i64* %src_slot +// %0 = getelementptr %"class.impala_udf::FunctionContext"*, +// %"class.impala_udf::FunctionContext"** %agg_fn_ctxs, i32 1 +// %agg_fn_ctx = load %"class.impala_udf::FunctionContext"*, +// %"class.impala_udf::FunctionContext"** %0 +// %1 = call %"class.impala::ExprContext"** +// 
@_ZNK6impala26NewPartitionedAggregationNode18GetAggExprContextsEi( +// %"class.impala::NewPartitionedAggregationNode"* %this_ptr, i32 1) +// call void @UpdateSlot(%"class.impala_udf::FunctionContext"* %agg_fn_ctx, +// %"class.impala::ExprContext"** %1, { i8, [7 x i8], i64, i64, double }* %tuple1, +// %"class.impala::TupleRow"* %row) +// %2 = getelementptr %"class.impala_udf::FunctionContext"*, +// %"class.impala_udf::FunctionContext"** %agg_fn_ctxs, i32 2 +// %agg_fn_ctx2 = load %"class.impala_udf::FunctionContext"*, +// %"class.impala_udf::FunctionContext"** %2 +// %3 = call %"class.impala::ExprContext"** +// @_ZNK6impala26NewPartitionedAggregationNode18GetAggExprContextsEi( +// %"class.impala::NewPartitionedAggregationNode"* %this_ptr, i32 2) +// call void @UpdateSlot.4(%"class.impala_udf::FunctionContext"* %agg_fn_ctx2, +// %"class.impala::ExprContext"** %3, { i8, [7 x i8], i64, i64, double }* %tuple1, +// %"class.impala::TupleRow"* %row) +// ret void +// } +//Status NewPartitionedAggregationNode::CodegenUpdateTuple( +// LlvmCodeGen* codegen, Function** fn) { +// SCOPED_TIMER(codegen->codegen_timer()); +// +// for (const SlotDescriptor* slot_desc : intermediate_tuple_desc_->slots()) { +// if (slot_desc->type().type == TYPE_CHAR) { +// return Status("NewPartitionedAggregationNode::CodegenUpdateTuple(): cannot codegen" +// "CHAR in aggregations"); +// } +// } +// +// if (intermediate_tuple_desc_->GetLlvmStruct(codegen) == NULL) { +// return Status("NewPartitionedAggregationNode::CodegenUpdateTuple(): failed to generate " +// "intermediate tuple desc"); +// } +// +// // Get the types to match the UpdateTuple signature +// Type* agg_node_type = codegen->GetType(NewPartitionedAggregationNode::LLVM_CLASS_NAME); +// Type* fn_ctx_type = codegen->GetType(FunctionContextImpl::LLVM_FUNCTIONCONTEXT_NAME); +// Type* tuple_type = codegen->GetType(Tuple::LLVM_CLASS_NAME); +// Type* tuple_row_type = codegen->GetType(TupleRow::LLVM_CLASS_NAME); +// +// PointerType* 
agg_node_ptr_type = codegen->GetPtrType(agg_node_type); +// PointerType* fn_ctx_ptr_ptr_type = codegen->GetPtrPtrType(fn_ctx_type); +// PointerType* tuple_ptr_type = codegen->GetPtrType(tuple_type); +// PointerType* tuple_row_ptr_type = codegen->GetPtrType(tuple_row_type); +// +// StructType* tuple_struct = intermediate_tuple_desc_->GetLlvmStruct(codegen); +// PointerType* tuple_ptr = codegen->GetPtrType(tuple_struct); +// LlvmCodeGen::FnPrototype prototype(codegen, "UpdateTuple", codegen->void_type()); +// prototype.AddArgument(LlvmCodeGen::NamedVariable("this_ptr", agg_node_ptr_type)); +// prototype.AddArgument(LlvmCodeGen::NamedVariable("agg_fn_ctxs", fn_ctx_ptr_ptr_type)); +// prototype.AddArgument(LlvmCodeGen::NamedVariable("tuple", tuple_ptr_type)); +// prototype.AddArgument(LlvmCodeGen::NamedVariable("row", tuple_row_ptr_type)); +// prototype.AddArgument(LlvmCodeGen::NamedVariable("is_merge", codegen->boolean_type())); +// +// LlvmBuilder builder(codegen->context()); +// Value* args[5]; +// *fn = prototype.GeneratePrototype(&builder, &args[0]); +// Value* this_arg = args[0]; +// Value* agg_fn_ctxs_arg = args[1]; +// Value* tuple_arg = args[2]; +// Value* row_arg = args[3]; +// +// // Cast the parameter types to the internal llvm runtime types. +// // TODO: get rid of this by using right type in function signature +// tuple_arg = builder.CreateBitCast(tuple_arg, tuple_ptr, "tuple"); +// +// Function* get_expr_ctxs_fn = +// codegen->GetFunction(IRFunction::PART_AGG_NODE_GET_EXPR_CTXS, false); +// DCHECK(get_expr_ctxs_fn != NULL); +// +// // Loop over each expr and generate the IR for that slot. If the expr is not +// // count(*), generate a helper IR function to update the slot and call that. 
+// int j = grouping_expr_ctxs_.size(); +// for (int i = 0; i < aggregate_evaluators_.size(); ++i, ++j) { +// SlotDescriptor* slot_desc = intermediate_tuple_desc_->slots()[j]; +// NewAggFnEvaluator* evaluator = aggregate_evaluators_[i]; +// if (evaluator->is_count_star()) { +// // TODO: we should be able to hoist this up to the loop over the batch and just +// // increment the slot by the number of rows in the batch. +// int field_idx = slot_desc->llvm_field_idx(); +// Value* const_one = codegen->GetIntConstant(TYPE_BIGINT, 1); +// Value* slot_ptr = builder.CreateStructGEP(NULL, tuple_arg, field_idx, "src_slot"); +// Value* slot_loaded = builder.CreateLoad(slot_ptr, "count_star_val"); +// Value* count_inc = builder.CreateAdd(slot_loaded, const_one, "count_star_inc"); +// builder.CreateStore(count_inc, slot_ptr); +// } else { +// Function* update_slot_fn; +// RETURN_IF_ERROR( +// CodegenUpdateSlot(codegen, evaluator, i, slot_desc, &update_slot_fn)); +// Value* agg_fn_ctx_ptr = builder.CreateConstGEP1_32(agg_fn_ctxs_arg, i); +// Value* agg_fn_ctx = builder.CreateLoad(agg_fn_ctx_ptr, "agg_fn_ctx"); +// // Call GetExprCtx() to get the expression context. +// DCHECK(agg_expr_ctxs_[i] != NULL); +// Value* get_expr_ctxs_args[] = {this_arg, codegen->GetIntConstant(TYPE_INT, i)}; +// Value* agg_expr_ctxs = builder.CreateCall(get_expr_ctxs_fn, get_expr_ctxs_args); +// Value* update_slot_args[] = {agg_fn_ctx, agg_expr_ctxs, tuple_arg, row_arg}; +// builder.CreateCall(update_slot_fn, update_slot_args); +// } +// } +// builder.CreateRetVoid(); +// +// // Avoid inlining big UpdateTuple function into outer loop - we're unlikely to get +// // any benefit from it since the function call overhead will be amortized. +// if (aggregate_evaluators_.size() > LlvmCodeGen::CODEGEN_INLINE_EXPR_BATCH_THRESHOLD) { +// codegen->SetNoInline(*fn); +// } +// +// // CodegenProcessBatch() does the final optimizations. 
+// *fn = codegen->FinalizeFunction(*fn); +// if (*fn == NULL) { +// return Status("NewPartitionedAggregationNode::CodegenUpdateTuple(): codegen'd " +// "UpdateTuple() function failed verification, see log"); +// } +// return Status::OK; +//} +// +//Status NewPartitionedAggregationNode::CodegenProcessBatch(LlvmCodeGen* codegen, +// TPrefetchMode::type prefetch_mode) { +// SCOPED_TIMER(codegen->codegen_timer()); +// +// Function* update_tuple_fn; +// RETURN_IF_ERROR(CodegenUpdateTuple(codegen, &update_tuple_fn)); +// +// // Get the cross compiled update row batch function +// IRFunction::Type ir_fn = (!grouping_expr_ctxs_.empty() ? +// IRFunction::PART_AGG_NODE_PROCESS_BATCH_UNAGGREGATED : +// IRFunction::PART_AGG_NODE_PROCESS_BATCH_NO_GROUPING); +// Function* process_batch_fn = codegen->GetFunction(ir_fn, true); +// DCHECK(process_batch_fn != NULL); +// +// int replaced; +// if (!grouping_expr_ctxs_.empty()) { +// // Codegen for grouping using hash table +// +// // Replace prefetch_mode with constant so branches can be optimised out. 
+// Value* prefetch_mode_arg = codegen->GetArgument(process_batch_fn, 3); +// prefetch_mode_arg->replaceAllUsesWith( +// ConstantInt::get(Type::getInt32Ty(codegen->context()), prefetch_mode)); +// +// // The codegen'd ProcessBatch function is only used in Open() with level_ = 0, +// // so don't use murmur hash +// Function* hash_fn; +// RETURN_IF_ERROR(ht_ctx_->CodegenHashRow(codegen, /* use murmur */ false, &hash_fn)); +// +// // Codegen HashTable::Equals +// Function* build_equals_fn; +// RETURN_IF_ERROR(ht_ctx_->CodegenEquals(codegen, true, &build_equals_fn)); +// +// // Codegen for evaluating input rows +// Function* eval_grouping_expr_fn; +// RETURN_IF_ERROR(ht_ctx_->CodegenEvalRow(codegen, false, &eval_grouping_expr_fn)); +// +// // Replace call sites +// replaced = codegen->ReplaceCallSites(process_batch_fn, eval_grouping_expr_fn, +// "EvalProbeRow"); +// DCHECK_EQ(replaced, 1); +// +// replaced = codegen->ReplaceCallSites(process_batch_fn, hash_fn, "HashRow"); +// DCHECK_EQ(replaced, 1); +// +// replaced = codegen->ReplaceCallSites(process_batch_fn, build_equals_fn, "Equals"); +// DCHECK_EQ(replaced, 1); +// +// NewPartitionedHashTableCtx::HashTableReplacedConstants replaced_constants; +// const bool stores_duplicates = false; +// RETURN_IF_ERROR(ht_ctx_->ReplaceHashTableConstants(codegen, stores_duplicates, 1, +// process_batch_fn, &replaced_constants)); +// DCHECK_GE(replaced_constants.stores_nulls, 1); +// DCHECK_GE(replaced_constants.finds_some_nulls, 1); +// DCHECK_GE(replaced_constants.stores_duplicates, 1); +// DCHECK_GE(replaced_constants.stores_tuples, 1); +// DCHECK_GE(replaced_constants.quadratic_probing, 1); +// } +// +// replaced = codegen->ReplaceCallSites(process_batch_fn, update_tuple_fn, "UpdateTuple"); +// DCHECK_GE(replaced, 1); +// process_batch_fn = codegen->FinalizeFunction(process_batch_fn); +// if (process_batch_fn == NULL) { +// return Status("NewPartitionedAggregationNode::CodegenProcessBatch(): codegen'd " +// "ProcessBatch() 
function failed verification, see log"); +// } +// +// void **codegened_fn_ptr = grouping_expr_ctxs_.empty() ? +// reinterpret_cast(&process_batch_no_grouping_fn_) : +// reinterpret_cast(&process_batch_fn_); +// codegen->AddFunctionToJit(process_batch_fn, codegened_fn_ptr); +// return Status::OK; +//} +// +//Status NewPartitionedAggregationNode::CodegenProcessBatchStreaming( +// LlvmCodeGen* codegen, TPrefetchMode::type prefetch_mode) { +// DCHECK(is_streaming_preagg_); +// SCOPED_TIMER(codegen->codegen_timer()); +// +// IRFunction::Type ir_fn = IRFunction::PART_AGG_NODE_PROCESS_BATCH_STREAMING; +// Function* process_batch_streaming_fn = codegen->GetFunction(ir_fn, true); +// DCHECK(process_batch_streaming_fn != NULL); +// +// // Make needs_serialize arg constant so dead code can be optimised out. +// Value* needs_serialize_arg = codegen->GetArgument(process_batch_streaming_fn, 2); +// needs_serialize_arg->replaceAllUsesWith( +// ConstantInt::get(Type::getInt1Ty(codegen->context()), needs_serialize_)); +// +// // Replace prefetch_mode with constant so branches can be optimised out. +// Value* prefetch_mode_arg = codegen->GetArgument(process_batch_streaming_fn, 3); +// prefetch_mode_arg->replaceAllUsesWith( +// ConstantInt::get(Type::getInt32Ty(codegen->context()), prefetch_mode)); +// +// Function* update_tuple_fn; +// RETURN_IF_ERROR(CodegenUpdateTuple(codegen, &update_tuple_fn)); +// +// // We only use the top-level hash function for streaming aggregations. 
+// Function* hash_fn; +// RETURN_IF_ERROR(ht_ctx_->CodegenHashRow(codegen, false, &hash_fn)); +// +// // Codegen HashTable::Equals +// Function* equals_fn; +// RETURN_IF_ERROR(ht_ctx_->CodegenEquals(codegen, true, &equals_fn)); +// +// // Codegen for evaluating input rows +// Function* eval_grouping_expr_fn; +// RETURN_IF_ERROR(ht_ctx_->CodegenEvalRow(codegen, false, &eval_grouping_expr_fn)); +// +// // Replace call sites +// int replaced = codegen->ReplaceCallSites(process_batch_streaming_fn, update_tuple_fn, +// "UpdateTuple"); +// DCHECK_EQ(replaced, 2); +// +// replaced = codegen->ReplaceCallSites(process_batch_streaming_fn, eval_grouping_expr_fn, +// "EvalProbeRow"); +// DCHECK_EQ(replaced, 1); +// +// replaced = codegen->ReplaceCallSites(process_batch_streaming_fn, hash_fn, "HashRow"); +// DCHECK_EQ(replaced, 1); +// +// replaced = codegen->ReplaceCallSites(process_batch_streaming_fn, equals_fn, "Equals"); +// DCHECK_EQ(replaced, 1); +// +// NewPartitionedHashTableCtx::HashTableReplacedConstants replaced_constants; +// const bool stores_duplicates = false; +// RETURN_IF_ERROR(ht_ctx_->ReplaceHashTableConstants(codegen, stores_duplicates, 1, +// process_batch_streaming_fn, &replaced_constants)); +// DCHECK_GE(replaced_constants.stores_nulls, 1); +// DCHECK_GE(replaced_constants.finds_some_nulls, 1); +// DCHECK_GE(replaced_constants.stores_duplicates, 1); +// DCHECK_GE(replaced_constants.stores_tuples, 1); +// DCHECK_GE(replaced_constants.quadratic_probing, 1); +// +// DCHECK(process_batch_streaming_fn != NULL); +// process_batch_streaming_fn = codegen->FinalizeFunction(process_batch_streaming_fn); +// if (process_batch_streaming_fn == NULL) { +// return Status("NewPartitionedAggregationNode::CodegenProcessBatchStreaming(): codegen'd " +// "ProcessBatchStreaming() function failed verification, see log"); +// } +// +// codegen->AddFunctionToJit(process_batch_streaming_fn, +// reinterpret_cast(&process_batch_streaming_fn_)); +// return Status::OK; +//} + +#endif 
+
+// Instantiate required templates.
+template Status NewPartitionedAggregationNode::AppendSpilledRow<false>(
+    Partition*, TupleRow*);
+template Status NewPartitionedAggregationNode::AppendSpilledRow<true>(Partition*, TupleRow*);
+
+}
+
diff --git a/be/src/exec/new_partitioned_aggregation_node.h b/be/src/exec/new_partitioned_aggregation_node.h
new file mode 100644
index 0000000000..b0c699f3ee
--- /dev/null
+++ b/be/src/exec/new_partitioned_aggregation_node.h
@@ -0,0 +1,743 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+ +#ifndef BDG_PALO_BE_SRC_EXEC_NEW_PARTITIONED_AGGREGATION_NODE_H +#define BDG_PALO_BE_SRC_EXEC_NEW_PARTITIONED_AGGREGATION_NODE_H + +#include + +#include + +#include "exec/exec_node.h" +#include "exec/new_partitioned_hash_table.h" +#include "runtime/buffered_tuple_stream3.h" +#include "runtime/bufferpool/suballocator.h" +#include "runtime/descriptors.h" // for TupleId +#include "runtime/mem_pool.h" +#include "runtime/string_value.h" + +namespace llvm { +// class BasicBlock; +class Function; +// class Value; +} + +namespace palo { + +class AggFn; +class NewAggFnEvaluator; +class CodegenAnyVal; +//class LlvmCodeGen; +//class LlvmBuilder; +class RowBatch; +class RuntimeState; +struct StringValue; +class Tuple; +class TupleDescriptor; +class SlotDescriptor; + +/// Node for doing partitioned hash aggregation. +/// This node consumes the input (which can be from the child(0) or a spilled partition). +/// 1. Each row is hashed and we pick a dst partition (hash_partitions_). +/// 2. If the dst partition is not spilled, we probe into the partitions hash table +/// to aggregate/insert the row. +/// 3. If the partition is already spilled, the input row is spilled. +/// 4. When all the input is consumed, we walk hash_partitions_, put the spilled ones +/// into spilled_partitions_ and the non-spilled ones into aggregated_partitions_. +/// aggregated_partitions_ contain partitions that are fully processed and the result +/// can just be returned. Partitions in spilled_partitions_ need to be repartitioned +/// and we just repeat these steps. +// +/// Each partition contains these structures: +/// 1) Hash Table for aggregated rows. This contains just the hash table directory +/// structure but not the rows themselves. This is NULL for spilled partitions when +/// we stop maintaining the hash table. +/// 2) MemPool for var-len result data for rows in the hash table. 
If the aggregate +/// function returns a string, we cannot append it to the tuple stream as that +/// structure is immutable. Instead, when we need to spill, we sweep and copy the +/// rows into a tuple stream. +/// 3) Aggregated tuple stream for rows that are/were in the hash table. This stream +/// contains rows that are aggregated. When the partition is not spilled, this stream +/// is pinned and contains the memory referenced by the hash table. +/// In the case where the aggregate function does not return a string (meaning the +/// size of all the slots is known when the row is constructed), this stream contains +/// all the memory for the result rows and the MemPool (2) is not used. +/// 4) Unaggregated tuple stream. Stream to spill unaggregated rows. +/// Rows in this stream always have child(0)'s layout. +/// +/// Buffering: Each stream and hash table needs to maintain at least one buffer for +/// some duration of the processing. To minimize the memory requirements of small queries +/// (i.e. memory usage is less than one IO-buffer per partition), the streams and hash +/// tables of each partition start using small (less than IO-sized) buffers, regardless +/// of the level. +/// +/// Two-phase aggregation: we support two-phase distributed aggregations, where +/// pre-aggregrations attempt to reduce the size of data before shuffling data across the +/// network to be merged by the merge aggregation node. This exec node supports a +/// streaming mode for pre-aggregations where it maintains a hash table of aggregated +/// rows, but can pass through unaggregated rows (after transforming them into the +/// same tuple format as aggregated rows) when a heuristic determines that it is better +/// to send rows across the network instead of consuming additional memory and CPU +/// resources to expand its hash table. 
The planner decides whether a given +/// pre-aggregation should use the streaming preaggregation algorithm or the same +/// blocking aggregation algorithm as used in merge aggregations. +/// TODO: make this less of a heuristic by factoring in the cost of the exchange vs the +/// cost of the pre-aggregation. +/// +/// If there are no grouping expressions, there is only a single output row for both +/// preaggregations and merge aggregations. This case is handled separately to avoid +/// building hash tables. There is also no need to do streaming preaggregations. +/// +/// Handling memory pressure: the node uses two different strategies for responding to +/// memory pressure, depending on whether it is a streaming pre-aggregation or not. If +/// the node is a streaming preaggregation, it stops growing its hash table further by +/// converting unaggregated rows into the aggregated tuple format and passing them +/// through. If the node is not a streaming pre-aggregation, it responds to memory +/// pressure by spilling partitions to disk. +/// +/// TODO: Buffer rows before probing into the hash table? +/// TODO: After spilling, we can still maintain a very small hash table just to remove +/// some number of rows (from likely going to disk). +/// TODO: Consider allowing to spill the hash table structure in addition to the rows. +/// TODO: Do we want to insert a buffer before probing into the partition's hash table? +/// TODO: Use a prefetch/batched probe interface. +/// TODO: Return rows from the aggregated_row_stream rather than the HT. +/// TODO: Think about spilling heuristic. +/// TODO: When processing a spilled partition, we have a lot more information and can +/// size the partitions/hash tables better. +/// TODO: Start with unpartitioned (single partition) and switch to partitioning and +/// spilling only if the size gets large, say larger than the LLC. +/// TODO: Simplify or cleanup the various uses of agg_fn_ctx, agg_fn_ctx_, and ctx. 
+/// There are so many contexts in use that a plain "ctx" variable should never be used. +/// Likewise, it's easy to mixup the agg fn ctxs, there should be a way to simplify this. +/// TODO: support an Init() method with an initial value in the UDAF interface. +class NewPartitionedAggregationNode : public ExecNode { + public: + + NewPartitionedAggregationNode(ObjectPool* pool, + const TPlanNode& tnode, const DescriptorTbl& descs); + + virtual Status init(const TPlanNode& tnode, RuntimeState* state); + virtual Status prepare(RuntimeState* state); +// virtual void Codegen(RuntimeState* state); + virtual Status open(RuntimeState* state); + virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos); + virtual Status reset(RuntimeState* state); + virtual Status close(RuntimeState* state); + + static const char* LLVM_CLASS_NAME; + + protected: + /// Frees local allocations from aggregate_evals_ and agg_fn_evals +// virtual Status QueryMaintenance(RuntimeState* state); + virtual std::string DebugString(int indentation_level) const; + virtual void DebugString(int indentation_level, std::stringstream* out) const; + + private: + struct Partition; + + /// Number of initial partitions to create. Must be a power of 2. + static const int PARTITION_FANOUT = 16; + + /// Needs to be the log(PARTITION_FANOUT). + /// We use the upper bits to pick the partition and lower bits in the HT. + /// TODO: different hash functions here too? We don't need that many bits to pick + /// the partition so this might be okay. + static const int NUM_PARTITIONING_BITS = 4; + + /// Maximum number of times we will repartition. The maximum build table we can process + /// (if we have enough scratch disk space) in case there is no skew is: + /// MEM_LIMIT * (PARTITION_FANOUT ^ MAX_PARTITION_DEPTH). + /// In the case where there is skew, repartitioning is unlikely to help (assuming a + /// reasonable hash function). 
+ /// Note that we need to have at least as many SEED_PRIMES in NewPartitionedHashTableCtx. + /// TODO: we can revisit and try harder to explicitly detect skew. + static const int MAX_PARTITION_DEPTH = 16; + + /// Default initial number of buckets in a hash table. + /// TODO: rethink this ? + static const int64_t PAGG_DEFAULT_HASH_TABLE_SZ = 1024; + + /// Codegen doesn't allow for automatic Status variables because then exception + /// handling code is needed to destruct the Status, and our function call substitution + /// doesn't know how to deal with the LLVM IR 'invoke' instruction. Workaround that by + /// placing the Status here so exceptions won't need to destruct it. + /// TODO: fix IMPALA-1948 and remove this. + Status process_batch_status_; + + /// Tuple into which Update()/Merge()/Serialize() results are stored. + TupleId intermediate_tuple_id_; + TupleDescriptor* intermediate_tuple_desc_; + + /// Row with the intermediate tuple as its only tuple. + /// Construct a new row desc for preparing the build exprs because neither the child's + /// nor this node's output row desc may contain the intermediate tuple, e.g., + /// in a single-node plan with an intermediate tuple different from the output tuple. + /// Lives in the query state's obj_pool. + RowDescriptor intermediate_row_desc_; + + /// Tuple into which Finalize() results are stored. Possibly the same as + /// the intermediate tuple. + TupleId output_tuple_id_; + TupleDescriptor* output_tuple_desc_; + + /// Certain aggregates require a finalize step, which is the final step of the + /// aggregate after consuming all input rows. The finalize step converts the aggregate + /// value into its final form. This is true if this node contains aggregate that + /// requires a finalize step. + const bool needs_finalize_; + + /// True if this is first phase of a two-phase distributed aggregation for which we + /// are doing a streaming preaggregation. 
+ bool is_streaming_preagg_; + + /// True if any of the evaluators require the serialize step. + bool needs_serialize_; + + /// The list of all aggregate operations for this exec node. + std::vector agg_fns_; + + /// Evaluators for each aggregate function. If this is a grouping aggregation, these + /// evaluators are only used to create cloned per-partition evaluators. The cloned + /// evaluators are then used to evaluate the functions. If this is a non-grouping + /// aggregation these evaluators are used directly to evaluate the functions. + /// + /// Permanent and result allocations for these allocators are allocated from + /// 'expr_perm_pool_' and 'expr_results_pool_' respectively. + std::vector agg_fn_evals_; + boost::scoped_ptr agg_fn_pool_; + + /// Exprs used to evaluate input rows + std::vector grouping_exprs_; + + /// Exprs used to insert constructed aggregation tuple into the hash table. + /// All the exprs are simply SlotRefs for the intermediate tuple. + std::vector build_exprs_; + + /// Exprs used to evaluate input rows + /// TODO (pengyubing) Is this variable useful? + std::vector grouping_expr_ctxs_; + + /// Indices of grouping exprs with var-len string types in grouping_expr_ctxs_. We need + /// to do more work for var-len expressions when allocating and spilling rows. All + /// var-len grouping exprs have type string. + std::vector string_grouping_exprs_; + + RuntimeState* state_; + /// Allocator for hash table memory. + boost::scoped_ptr ht_allocator_; + /// MemPool used to allocate memory for when we don't have grouping and don't initialize + /// the partitioning structures, or during Close() when creating new output tuples. + /// For non-grouping aggregations, the ownership of the pool's memory is transferred + /// to the output batch on eos. The pool should not be Reset() to allow amortizing + /// memory allocation over a series of Reset()/Open()/GetNext()* calls. 
+ boost::scoped_ptr mem_pool_; + + /// The current partition and iterator to the next row in its hash table that we need + /// to return in GetNext() + Partition* output_partition_; + NewPartitionedHashTable::Iterator output_iterator_; + + typedef Status (*ProcessBatchNoGroupingFn)(NewPartitionedAggregationNode*, RowBatch*); + /// Jitted ProcessBatchNoGrouping function pointer. Null if codegen is disabled. + ProcessBatchNoGroupingFn process_batch_no_grouping_fn_; + + typedef Status (*ProcessBatchFn)( + NewPartitionedAggregationNode*, RowBatch*, NewPartitionedHashTableCtx*); + /// Jitted ProcessBatch function pointer. Null if codegen is disabled. + ProcessBatchFn process_batch_fn_; + + typedef Status (*ProcessBatchStreamingFn)(NewPartitionedAggregationNode*, bool, + RowBatch*, RowBatch*, NewPartitionedHashTableCtx*, int[PARTITION_FANOUT]); + /// Jitted ProcessBatchStreaming function pointer. Null if codegen is disabled. + ProcessBatchStreamingFn process_batch_streaming_fn_; + + /// Time spent processing the child rows + RuntimeProfile::Counter* build_timer_; + + /// Total time spent resizing hash tables. + RuntimeProfile::Counter* ht_resize_timer_; + + /// Time spent returning the aggregated rows + RuntimeProfile::Counter* get_results_timer_; + + /// Total number of hash buckets across all partitions. + RuntimeProfile::Counter* num_hash_buckets_; + + /// Total number of partitions created. + RuntimeProfile::Counter* partitions_created_; + + /// Level of max partition (i.e. number of repartitioning steps). + RuntimeProfile::HighWaterMarkCounter* max_partition_level_; + + /// Number of rows that have been repartitioned. + RuntimeProfile::Counter* num_row_repartitioned_; + + /// Number of partitions that have been repartitioned. + RuntimeProfile::Counter* num_repartitions_; + + /// Number of partitions that have been spilled. + RuntimeProfile::Counter* num_spilled_partitions_; + + /// The largest fraction after repartitioning. 
This is expected to be + /// 1 / PARTITION_FANOUT. A value much larger indicates skew. + RuntimeProfile::HighWaterMarkCounter* largest_partition_percent_; + + /// Time spent in streaming preagg algorithm. + RuntimeProfile::Counter* streaming_timer_; + + /// The number of rows passed through without aggregation. + RuntimeProfile::Counter* num_passthrough_rows_; + + /// The estimated reduction of the preaggregation. + RuntimeProfile::Counter* preagg_estimated_reduction_; + + /// Expose the minimum reduction factor to continue growing the hash tables. + RuntimeProfile::Counter* preagg_streaming_ht_min_reduction_; + + /// The estimated number of input rows from the planner. + int64_t estimated_input_cardinality_; + + ///////////////////////////////////////// + /// BEGIN: Members that must be Reset() + + /// Result of aggregation w/o GROUP BY. + /// Note: can be NULL even if there is no grouping if the result tuple is 0 width + /// e.g. select 1 from table group by col. + Tuple* singleton_output_tuple_; + bool singleton_output_tuple_returned_; + + /// Row batch used as argument to GetNext() for the child node preaggregations. Store + /// in node to avoid reallocating for every GetNext() call when streaming. + boost::scoped_ptr child_batch_; + + /// If true, no more rows to output from partitions. + bool partition_eos_; + + /// True if no more rows to process from child. + bool child_eos_; + + /// Used for hash-related functionality, such as evaluating rows and calculating hashes. + /// It also owns the evaluators for the grouping and build expressions used during hash + /// table insertion and probing. + boost::scoped_ptr ht_ctx_; + + /// Object pool that holds the Partition objects in hash_partitions_. + boost::scoped_ptr partition_pool_; + + /// Current partitions we are partitioning into. IMPALA-5788: For the case where we + /// rebuild a spilled partition that fits in memory, all pointers in this vector will + /// point to a single in-memory partition. 
+ std::vector hash_partitions_; + + /// Cache for hash tables in 'hash_partitions_'. IMPALA-5788: For the case where we + /// rebuild a spilled partition that fits in memory, all pointers in this array will + /// point to the hash table that is a part of a single in-memory partition. + NewPartitionedHashTable* hash_tbls_[PARTITION_FANOUT]; + + /// All partitions that have been spilled and need further processing. + std::deque spilled_partitions_; + + /// All partitions that are aggregated and can just return the results in GetNext(). + /// After consuming all the input, hash_partitions_ is split into spilled_partitions_ + /// and aggregated_partitions_, depending on if it was spilled or not. + std::deque aggregated_partitions_; + + /// END: Members that must be Reset() + ///////////////////////////////////////// + + /// The hash table and streams (aggregated and unaggregated) for an individual + /// partition. The streams of each partition always (i.e. regardless of level) + /// initially use small buffers. Streaming pre-aggregations do not spill and do not + /// require an unaggregated stream. + struct Partition { + Partition(NewPartitionedAggregationNode* parent, int level, int idx) + : parent(parent), is_closed(false), level(level), idx(idx) {} + + ~Partition(); + + /// Initializes aggregated_row_stream and unaggregated_row_stream (if a spilling + /// aggregation), allocating one buffer for each. Spilling merge aggregations must + /// have enough reservation for the initial buffer for the stream, so this should + /// not fail due to OOM. Preaggregations do not reserve any buffers: if does not + /// have enough reservation for the initial buffer, the aggregated row stream is not + /// created and an OK status is returned. + Status InitStreams(); + + /// Initializes the hash table. 'aggregated_row_stream' must be non-NULL. + /// Sets 'got_memory' to true if the hash table was initialised or false on OOM. 
+ Status InitHashTable(bool* got_memory); + + /// Called in case we need to serialize aggregated rows. This step effectively does + /// a merge aggregation in this node. + Status SerializeStreamForSpilling(); + + /// Closes this partition. If finalize_rows is true, this iterates over all rows + /// in aggregated_row_stream and finalizes them (this is only used in the cancellation + /// path). + void Close(bool finalize_rows); + + /// Spill this partition. 'more_aggregate_rows' = true means that more aggregate rows + /// may be appended to the the partition before appending unaggregated rows. On + /// success, one of the streams is left with a write iterator: the aggregated stream + /// if 'more_aggregate_rows' is true or the unaggregated stream otherwise. + Status Spill(bool more_aggregate_rows); + + bool is_spilled() const { return hash_tbl.get() == NULL; } + + NewPartitionedAggregationNode* parent; + + /// If true, this partition is closed and there is nothing left to do. + bool is_closed; + + /// How many times rows in this partition have been repartitioned. Partitions created + /// from the node's children's input is level 0, 1 after the first repartitionining, + /// etc. + const int level; + + /// The index of this partition within 'hash_partitions_' at its level. + const int idx; + + /// Hash table for this partition. + /// Can be NULL if this partition is no longer maintaining a hash table (i.e. + /// is spilled or we are passing through all rows for this partition). + boost::scoped_ptr hash_tbl; + + /// Clone of parent's agg_fn_evals_. Permanent allocations come from + /// 'agg_fn_perm_pool' and result allocations come from the ExecNode's + /// 'expr_results_pool_'. + std::vector agg_fn_evals; + boost::scoped_ptr agg_fn_pool; + + /// Tuple stream used to store aggregated rows. When the partition is not spilled, + /// (meaning the hash table is maintained), this stream is pinned and contains the + /// memory referenced by the hash table. 
When it is spilled, this consumes reservation + /// for a write buffer only during repartitioning of aggregated rows. + /// + /// For streaming preaggs, this may be NULL if sufficient memory is not available. + /// In that case hash_tbl is also NULL and all rows for the partition will be passed + /// through. + boost::scoped_ptr aggregated_row_stream; + + /// Unaggregated rows that are spilled. Always NULL for streaming pre-aggregations. + /// Always unpinned. Has a write buffer allocated when the partition is spilled and + /// unaggregated rows are being processed. + boost::scoped_ptr unaggregated_row_stream; + }; + + /// Stream used to store serialized spilled rows. Only used if needs_serialize_ + /// is set. This stream is never pinned and only used in Partition::Spill as a + /// a temporary buffer. + boost::scoped_ptr serialize_stream_; + + /// Accessor for 'hash_tbls_' that verifies consistency with the partitions. + NewPartitionedHashTable* ALWAYS_INLINE GetHashTable(int partition_idx) { + NewPartitionedHashTable* ht = hash_tbls_[partition_idx]; + DCHECK_EQ(ht, hash_partitions_[partition_idx]->hash_tbl.get()); + return ht; + } + + /// Materializes 'row_batch' in either grouping or non-grouping case. + Status GetNextInternal(RuntimeState* state, RowBatch* row_batch, bool* eos); + + /// Helper function called by GetNextInternal() to ensure that string data referenced in + /// 'row_batch' will live as long as 'row_batch's tuples. 'first_row_idx' indexes the + /// first row that should be processed in 'row_batch'. + Status HandleOutputStrings(RowBatch* row_batch, int first_row_idx); + + /// Copies string data from the specified slot into 'pool', and sets the StringValues' + /// ptrs to the copied data. Copies data from all tuples in 'row_batch' from + /// 'first_row_idx' onwards. 'slot_desc' must have a var-len string type. 
+ Status CopyStringData(const SlotDescriptor& slot_desc, RowBatch* row_batch, + int first_row_idx, MemPool* pool); + + /// Constructs singleton output tuple, allocating memory from pool. + Tuple* ConstructSingletonOutputTuple( + const std::vector& agg_fn_evals, MemPool* pool); + + /// Copies grouping values stored in 'ht_ctx_' that were computed over 'current_row_' + /// using 'grouping_expr_evals_'. Aggregation expr slots are set to their initial + /// values. Returns NULL if there was not enough memory to allocate the tuple or errors + /// occurred. In which case, 'status' is set. Allocates tuple and var-len data for + /// grouping exprs from stream. Var-len data for aggregate exprs is allocated from the + /// FunctionContexts, so is stored outside the stream. If stream's small buffers get + /// full, it will attempt to switch to IO-buffers. + Tuple* ConstructIntermediateTuple(const std::vector& agg_fn_evals, + BufferedTupleStream3* stream, Status* status); + + /// Constructs intermediate tuple, allocating memory from pool instead of the stream. + /// Returns NULL and sets status if there is not enough memory to allocate the tuple. + Tuple* ConstructIntermediateTuple(const std::vector& agg_fn_evals, + MemPool* pool, Status* status); + + /// Returns the number of bytes of variable-length data for the grouping values stored + /// in 'ht_ctx_'. + int GroupingExprsVarlenSize(); + + /// Initializes intermediate tuple by copying grouping values stored in 'ht_ctx_' that + /// that were computed over 'current_row_' using 'grouping_expr_evals_'. Writes the + /// var-len data into buffer. 'buffer' points to the start of a buffer of at least the + /// size of the variable-length data: 'varlen_size'. + void CopyGroupingValues(Tuple* intermediate_tuple, uint8_t* buffer, int varlen_size); + + /// Initializes the aggregate function slots of an intermediate tuple. + /// Any var-len data is allocated from the FunctionContexts. 
+ void InitAggSlots(const std::vector& agg_fn_evals, + Tuple* intermediate_tuple); + + /// Updates the given aggregation intermediate tuple with aggregation values computed + /// over 'row' using 'agg_fn_evals'. Whether the agg fn evaluator calls Update() or + /// Merge() is controlled by the evaluator itself, unless enforced explicitly by passing + /// in is_merge == true. The override is needed to merge spilled and non-spilled rows + /// belonging to the same partition independent of whether the agg fn evaluators have + /// is_merge() == true. + /// This function is replaced by codegen (which is why we don't use a vector argument + /// for agg_fn_evals).. Any var-len data is allocated from the FunctionContexts. + void UpdateTuple(NewAggFnEvaluator** agg_fn_evals, Tuple* tuple, TupleRow* row, + bool is_merge = false); + + /// Called on the intermediate tuple of each group after all input rows have been + /// consumed and aggregated. Computes the final aggregate values to be returned in + /// GetNext() using the agg fn evaluators' Serialize() or Finalize(). + /// For the Finalize() case if the output tuple is different from the intermediate + /// tuple, then a new tuple is allocated from 'pool' to hold the final result. + /// Grouping values are copied into the output tuple and the the output tuple holding + /// the finalized/serialized aggregate values is returned. + /// TODO: Coordinate the allocation of new tuples with the release of memory + /// so as not to make memory consumption blow up. + Tuple* GetOutputTuple(const std::vector& agg_fn_evals, + Tuple* tuple, MemPool* pool); + + /// Do the aggregation for all tuple rows in the batch when there is no grouping. + /// This function is replaced by codegen. + Status ProcessBatchNoGrouping(RowBatch* batch); + + /// Processes a batch of rows. This is the core function of the algorithm. We partition + /// the rows into hash_partitions_, spilling as necessary. 
+ /// If AGGREGATED_ROWS is true, it means that the rows in the batch are already + /// pre-aggregated. + /// 'prefetch_mode' specifies the prefetching mode in use. If it's not PREFETCH_NONE, + /// hash table buckets will be prefetched based on the hash values computed. Note + /// that 'prefetch_mode' will be substituted with constants during codegen time. + // + /// This function is replaced by codegen. We pass in ht_ctx_.get() as an argument for + /// performance. + template + Status IR_ALWAYS_INLINE ProcessBatch(RowBatch* batch, NewPartitionedHashTableCtx* ht_ctx); + + /// Evaluates the rows in 'batch' starting at 'start_row_idx' and stores the results in + /// the expression values cache in 'ht_ctx'. The number of rows evaluated depends on + /// the capacity of the cache. 'prefetch_mode' specifies the prefetching mode in use. + /// If it's not PREFETCH_NONE, hash table buckets for the computed hashes will be + /// prefetched. Note that codegen replaces 'prefetch_mode' with a constant. + template + void EvalAndHashPrefetchGroup(RowBatch* batch, int start_row_idx, NewPartitionedHashTableCtx* ht_ctx); + + /// This function processes each individual row in ProcessBatch(). Must be inlined into + /// ProcessBatch for codegen to substitute function calls with codegen'd versions. + /// May spill partitions if not enough memory is available. + template + Status IR_ALWAYS_INLINE ProcessRow(TupleRow* row, NewPartitionedHashTableCtx* ht_ctx); + + /// Create a new intermediate tuple in partition, initialized with row. ht_ctx is + /// the context for the partition's hash table and hash is the precomputed hash of + /// the row. The row can be an unaggregated or aggregated row depending on + /// AGGREGATED_ROWS. Spills partitions if necessary to append the new intermediate + /// tuple to the partition's stream. Must be inlined into ProcessBatch for codegen + /// to substitute function calls with codegen'd versions. 
insert_it is an iterator + /// for insertion returned from NewPartitionedHashTable::FindBuildRowBucket(). + template + Status IR_ALWAYS_INLINE AddIntermediateTuple(Partition* partition, + TupleRow* row, uint32_t hash, NewPartitionedHashTable::Iterator insert_it); + + /// Append a row to a spilled partition. May spill partitions if needed to switch to + /// I/O buffers. Selects the correct stream according to the argument. Inlined into + /// ProcessBatch(). + template + Status IR_ALWAYS_INLINE AppendSpilledRow(Partition* partition, TupleRow* row); + + /// Reads all the rows from input_stream and process them by calling ProcessBatch(). + template + Status ProcessStream(BufferedTupleStream3* input_stream); + + /// Output 'singleton_output_tuple_' and transfer memory to 'row_batch'. + void GetSingletonOutput(RowBatch* row_batch); + + /// Get rows for the next rowbatch from the next partition. Sets 'partition_eos_' to + /// true if all rows from all partitions have been returned or the limit is reached. + Status GetRowsFromPartition(RuntimeState* state, RowBatch* row_batch); + + /// Get output rows from child for streaming pre-aggregation. Aggregates some rows with + /// hash table and passes through other rows converted into the intermediate + /// tuple format. Sets 'child_eos_' once all rows from child have been returned. + Status GetRowsStreaming(RuntimeState* state, RowBatch* row_batch); + + /// Return true if we should keep expanding hash tables in the preagg. If false, + /// the preagg should pass through any rows it can't fit in its tables. + bool ShouldExpandPreaggHashTables() const; + + /// Streaming processing of in_batch from child. Rows from child are either aggregated + /// into the hash table or added to 'out_batch' in the intermediate tuple format. + /// 'in_batch' is processed entirely, and 'out_batch' must have enough capacity to + /// store all of the rows in 'in_batch'. 
+ /// 'needs_serialize' is an argument so that codegen can replace it with a constant, + /// rather than using the member variable 'needs_serialize_'. + /// 'prefetch_mode' specifies the prefetching mode in use. If it's not PREFETCH_NONE, + /// hash table buckets will be prefetched based on the hash values computed. Note + /// that 'prefetch_mode' will be substituted with constants during codegen time. + /// 'remaining_capacity' is an array with PARTITION_FANOUT entries with the number of + /// additional rows that can be added to the hash table per partition. It is updated + /// by ProcessBatchStreaming() when it inserts new rows. + /// 'ht_ctx' is passed in as a way to avoid aliasing of 'this' confusing the optimiser. + Status ProcessBatchStreaming(bool needs_serialize, + RowBatch* in_batch, RowBatch* out_batch, NewPartitionedHashTableCtx* ht_ctx, + int remaining_capacity[PARTITION_FANOUT]); + + /// Tries to add intermediate to the hash table 'hash_tbl' of 'partition' for streaming + /// aggregation. The input row must have been evaluated with 'ht_ctx', with 'hash' set + /// to the corresponding hash. If the tuple already exists in the hash table, update + /// the tuple and return true. Otherwise try to create a new entry in the hash table, + /// returning true if successful or false if the table is full. 'remaining_capacity' + /// keeps track of how many more entries can be added to the hash table so we can avoid + /// retrying inserts. It is decremented if an insert succeeds and set to zero if an + /// insert fails. If an error occurs, returns false and sets 'status'. + bool IR_ALWAYS_INLINE TryAddToHashTable(NewPartitionedHashTableCtx* ht_ctx, + Partition* partition, NewPartitionedHashTable* hash_tbl, TupleRow* in_row, uint32_t hash, + int* remaining_capacity, Status* status); + + /// Initializes hash_partitions_. 'level' is the level for the partitions to create. 
+ /// If 'single_partition_idx' is provided, it must be a number in range + /// [0, PARTITION_FANOUT), and only that partition is created - all others point to it. + /// Also sets ht_ctx_'s level to 'level'. + Status CreateHashPartitions(int level, int single_partition_idx = -1); + + /// Ensure that hash tables for all in-memory partitions are large enough to fit + /// 'num_rows' additional hash table entries. If there is not enough memory to + /// resize the hash tables, may spill partitions. 'aggregated_rows' is true if + /// we're currently partitioning aggregated rows. + Status CheckAndResizeHashPartitions(bool aggregated_rows, int num_rows, const NewPartitionedHashTableCtx* ht_ctx); + + /// Prepares the next partition to return results from. On return, this function + /// initializes output_iterator_ and output_partition_. This either removes + /// a partition from aggregated_partitions_ (and is done) or removes the next + /// partition from aggregated_partitions_ and repartitions it. + Status NextPartition(); + + /// Tries to build the first partition in 'spilled_partitions_'. + /// If successful, set *built_partition to the partition. The caller owns the partition + /// and is responsible for closing it. If unsuccessful because the partition could not + /// fit in memory, set *built_partition to NULL and append the spilled partition to the + /// head of 'spilled_partitions_' so it can be processed by + /// RepartitionSpilledPartition(). + Status BuildSpilledPartition(Partition** built_partition); + + /// Repartitions the first partition in 'spilled_partitions_' into PARTITION_FANOUT + /// output partitions. On success, each output partition is either: + /// * closed, if no rows were added to the partition. + /// * in 'spilled_partitions_', if the partition spilled. + /// * in 'aggregated_partitions_', if the output partition was not spilled. + Status RepartitionSpilledPartition(); + + /// Picks a partition from 'hash_partitions_' to spill. 
'more_aggregate_rows' is passed + /// to Partition::Spill() when spilling the partition. See the Partition::Spill() + /// comment for further explanation. + Status SpillPartition(bool more_aggregate_rows); + + /// Moves the partitions in hash_partitions_ to aggregated_partitions_ or + /// spilled_partitions_. Partitions moved to spilled_partitions_ are unpinned. + /// input_rows is the number of input rows that have been repartitioned. + /// Used for diagnostics. + Status MoveHashPartitions(int64_t input_rows); + + /// Adds a partition to the front of 'spilled_partitions_' for later processing. + /// 'spilled_partitions_' uses LIFO so more finely partitioned partitions are processed + /// first). This allows us to delete pages earlier and bottom out the recursion + /// earlier and also improves time locality of access to spilled data on disk. + void PushSpilledPartition(Partition* partition); + + /// Calls Close() on every Partition in 'aggregated_partitions_', + /// 'spilled_partitions_', and 'hash_partitions_' and then resets the lists, + /// the vector and the partition pool. + void ClosePartitions(); + + /// Calls finalizes on all tuples starting at 'it'. + void CleanupHashTbl(const std::vector& agg_fn_evals, + NewPartitionedHashTable::Iterator it); + + /// Codegen UpdateSlot(). Returns non-OK status if codegen is unsuccessful. + /// Assumes is_merge = false; +// Status CodegenUpdateSlot(LlvmCodeGen* codegen, NewAggFnEvaluator* evaluator, +// int evaluator_idx, SlotDescriptor* slot_desc, llvm::Function** fn); + + /// Codegen a call to a function implementing the UDA interface with input values + /// from 'input_vals'. 'dst_val' should contain the previous value of the aggregate + /// function, and 'updated_dst_val' is set to the new value after the Update or Merge + /// operation is applied. The instruction sequence for the UDA call is inserted at + /// the insert position of 'builder'. 
+// Status CodegenCallUda(LlvmCodeGen* codegen, LlvmBuilder* builder, +// NewAggFnEvaluator* evaluator, llvm::Value* agg_fn_ctx_arg, +// const std::vector& input_vals, const CodegenAnyVal& dst_val, +// CodegenAnyVal* updated_dst_val); + + /// Codegen UpdateTuple(). Returns non-OK status if codegen is unsuccessful. +// Status CodegenUpdateTuple(LlvmCodeGen* codegen, llvm::Function** fn); + + /// Codegen the non-streaming process row batch loop. The loop has already been + /// compiled to IR and loaded into the codegen object. UpdateAggTuple has also been + /// codegen'd to IR. This function will modify the loop subsituting the statically + /// compiled functions with codegen'd ones. 'process_batch_fn_' or + /// 'process_batch_no_grouping_fn_' will be updated with the codegened function + /// depending on whether this is a grouping or non-grouping aggregation. + /// Assumes AGGREGATED_ROWS = false. +// Status CodegenProcessBatch(LlvmCodeGen* codegen); + + /// Codegen the materialization loop for streaming preaggregations. + /// 'process_batch_streaming_fn_' will be updated with the codegened function. +// Status CodegenProcessBatchStreaming(LlvmCodeGen* codegen); + + /// Compute minimum buffer reservation for grouping aggregations. + /// We need one buffer per partition, which is used either as the write buffer for the + /// aggregated stream or the unaggregated stream. We need an additional buffer to read + /// the stream we are currently repartitioning. The read buffer needs to be a max-sized + /// buffer to hold a max-sized row and we need one max-sized write buffer that is used + /// temporarily to append a row to any stream. + /// + /// If we need to serialize, we need an additional buffer while spilling a partition + /// as the partitions aggregate stream needs to be serialized and rewritten. + /// We do not spill streaming preaggregations, so we do not need to reserve any buffers. 
+ int64_t MinReservation() const { + //DCHECK(!grouping_exprs_.empty()); + // Must be kept in sync with AggregationNode.computeNodeResourceProfile() in fe. + //if (is_streaming_preagg_) { + // Reserve at least one buffer and a 64kb hash table per partition. + // return (_resource_profile.spillable_buffer_size + 64 * 1024) * PARTITION_FANOUT; + //} + //int num_buffers = PARTITION_FANOUT + 1 + (needs_serialize_ ? 1 : 0); + // Two of the buffers must fit the maximum row. + //return _resource_profile.spillable_buffer_size * (num_buffers - 2) + + //_resource_profile.max_row_buffer_size * 2; + return 0; + } +}; + +} + +#endif + diff --git a/be/src/exec/new_partitioned_aggregation_node_ir.cc b/be/src/exec/new_partitioned_aggregation_node_ir.cc new file mode 100644 index 0000000000..da3ff63ca3 --- /dev/null +++ b/be/src/exec/new_partitioned_aggregation_node_ir.cc @@ -0,0 +1,254 @@ +// Modifications copyright (C) 2017, Baidu.com, Inc. +// Copyright 2017 The Apache Software Foundation + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "exec/new_partitioned_aggregation_node.h" + +#include "exec/new_partitioned_hash_table.inline.h" +#include "exprs/new_agg_fn_evaluator.h" +#include "exprs/expr_context.h" +#include "runtime/buffered_tuple_stream3.inline.h" +#include "runtime/row_batch.h" +#include "runtime/tuple_row.h" + +using namespace palo; + +Status NewPartitionedAggregationNode::ProcessBatchNoGrouping(RowBatch* batch) { + Tuple* output_tuple = singleton_output_tuple_; + FOREACH_ROW(batch, 0, batch_iter) { + UpdateTuple(agg_fn_evals_.data(), output_tuple, batch_iter.get()); + } + return Status::OK; +} + +template +Status NewPartitionedAggregationNode::ProcessBatch(RowBatch* batch, + NewPartitionedHashTableCtx* ht_ctx) { + DCHECK(!hash_partitions_.empty()); + DCHECK(!is_streaming_preagg_); + + // Make sure that no resizes will happen when inserting individual rows to the hash + // table of each partition by pessimistically assuming that all the rows in each batch + // will end up to the same partition. + // TODO: Once we have a histogram with the number of rows per partition, we will have + // accurate resize calls. 
+ RETURN_IF_ERROR(CheckAndResizeHashPartitions(AGGREGATED_ROWS, batch->num_rows(), ht_ctx)); + + NewPartitionedHashTableCtx::ExprValuesCache* expr_vals_cache = ht_ctx->expr_values_cache(); + const int cache_size = expr_vals_cache->capacity(); + const int num_rows = batch->num_rows(); + for (int group_start = 0; group_start < num_rows; group_start += cache_size) { + EvalAndHashPrefetchGroup(batch, group_start, ht_ctx); + + FOREACH_ROW_LIMIT(batch, group_start, cache_size, batch_iter) { + RETURN_IF_ERROR(ProcessRow(batch_iter.get(), ht_ctx)); + expr_vals_cache->NextRow(); + } + DCHECK(expr_vals_cache->AtEnd()); + } + return Status::OK; +} + +template +void IR_ALWAYS_INLINE NewPartitionedAggregationNode::EvalAndHashPrefetchGroup( + RowBatch* batch, int start_row_idx, + NewPartitionedHashTableCtx* ht_ctx) { + NewPartitionedHashTableCtx::ExprValuesCache* expr_vals_cache = ht_ctx->expr_values_cache(); + const int cache_size = expr_vals_cache->capacity(); + + expr_vals_cache->Reset(); + FOREACH_ROW_LIMIT(batch, start_row_idx, cache_size, batch_iter) { + TupleRow* row = batch_iter.get(); + bool is_null; + if (AGGREGATED_ROWS) { + is_null = !ht_ctx->EvalAndHashBuild(row); + } else { + is_null = !ht_ctx->EvalAndHashProbe(row); + } + // Hoist lookups out of non-null branch to speed up non-null case. 
+ const uint32_t hash = expr_vals_cache->CurExprValuesHash(); + const uint32_t partition_idx = hash >> (32 - NUM_PARTITIONING_BITS); + NewPartitionedHashTable* hash_tbl = GetHashTable(partition_idx); + if (is_null) { + expr_vals_cache->SetRowNull(); + } else if (config::enable_prefetch) { + if (LIKELY(hash_tbl != NULL)) hash_tbl->PrefetchBucket(hash); + } + expr_vals_cache->NextRow(); + } + + expr_vals_cache->ResetForRead(); +} + +template +Status NewPartitionedAggregationNode::ProcessRow(TupleRow* row, + NewPartitionedHashTableCtx* ht_ctx) { + NewPartitionedHashTableCtx::ExprValuesCache* expr_vals_cache = ht_ctx->expr_values_cache(); + // Hoist lookups out of non-null branch to speed up non-null case. + const uint32_t hash = expr_vals_cache->CurExprValuesHash(); + const uint32_t partition_idx = hash >> (32 - NUM_PARTITIONING_BITS); + if (expr_vals_cache->IsRowNull()) return Status::OK; + // To process this row, we first see if it can be aggregated or inserted into this + // partition's hash table. If we need to insert it and that fails, due to OOM, we + // spill the partition. The partition to spill is not necessarily dst_partition, + // so we can try again to insert the row. + NewPartitionedHashTable* hash_tbl = GetHashTable(partition_idx); + Partition* dst_partition = hash_partitions_[partition_idx]; + DCHECK(dst_partition != nullptr); + DCHECK_EQ(dst_partition->is_spilled(), hash_tbl == NULL); + if (hash_tbl == NULL) { + // This partition is already spilled, just append the row. + return AppendSpilledRow(dst_partition, row); + } + + DCHECK(dst_partition->aggregated_row_stream->is_pinned()); + bool found; + // Find the appropriate bucket in the hash table. There will always be a free + // bucket because we checked the size above. 
+ NewPartitionedHashTable::Iterator it = hash_tbl->FindBuildRowBucket(ht_ctx, &found); + DCHECK(!it.AtEnd()) << "Hash table had no free buckets"; + if (AGGREGATED_ROWS) { + // If the row is already an aggregate row, it cannot match anything in the + // hash table since we process the aggregate rows first. These rows should + // have been aggregated in the initial pass. + DCHECK(!found); + } else if (found) { + // Row is already in hash table. Do the aggregation and we're done. + UpdateTuple(dst_partition->agg_fn_evals.data(), it.GetTuple(), row); + return Status::OK; + } + + // If we are seeing this result row for the first time, we need to construct the + // result row and initialize it. + return AddIntermediateTuple(dst_partition, row, hash, it); +} + +template +Status NewPartitionedAggregationNode::AddIntermediateTuple(Partition* partition, + TupleRow* row, uint32_t hash, NewPartitionedHashTable::Iterator insert_it) { + while (true) { + DCHECK(partition->aggregated_row_stream->is_pinned()); + Tuple* intermediate_tuple = ConstructIntermediateTuple(partition->agg_fn_evals, + partition->aggregated_row_stream.get(), &process_batch_status_); + + if (LIKELY(intermediate_tuple != NULL)) { + UpdateTuple(partition->agg_fn_evals.data(), intermediate_tuple, row, AGGREGATED_ROWS); + // After copying and initializing the tuple, insert it into the hash table. + insert_it.SetTuple(intermediate_tuple, hash); + return Status::OK; + } else if (!process_batch_status_.ok()) { + return std::move(process_batch_status_); + } + + // We did not have enough memory to add intermediate_tuple to the stream. 
+ RETURN_IF_ERROR(SpillPartition(AGGREGATED_ROWS)); + if (partition->is_spilled()) { + return AppendSpilledRow(partition, row); + } + } +} + +Status NewPartitionedAggregationNode::ProcessBatchStreaming(bool needs_serialize, + RowBatch* in_batch, RowBatch* out_batch, + NewPartitionedHashTableCtx* ht_ctx, int remaining_capacity[PARTITION_FANOUT]) { + DCHECK(is_streaming_preagg_); + DCHECK_EQ(out_batch->num_rows(), 0); + DCHECK_LE(in_batch->num_rows(), out_batch->capacity()); + + RowBatch::Iterator out_batch_iterator(out_batch, out_batch->num_rows()); + NewPartitionedHashTableCtx::ExprValuesCache* expr_vals_cache = ht_ctx->expr_values_cache(); + const int num_rows = in_batch->num_rows(); + const int cache_size = expr_vals_cache->capacity(); + for (int group_start = 0; group_start < num_rows; group_start += cache_size) { + EvalAndHashPrefetchGroup(in_batch, group_start, ht_ctx); + + FOREACH_ROW_LIMIT(in_batch, group_start, cache_size, in_batch_iter) { + // Hoist lookups out of non-null branch to speed up non-null case. + TupleRow* in_row = in_batch_iter.get(); + const uint32_t hash = expr_vals_cache->CurExprValuesHash(); + const uint32_t partition_idx = hash >> (32 - NUM_PARTITIONING_BITS); + if (!expr_vals_cache->IsRowNull() && + !TryAddToHashTable(ht_ctx, hash_partitions_[partition_idx], + GetHashTable(partition_idx), in_row, hash, &remaining_capacity[partition_idx], + &process_batch_status_)) { + RETURN_IF_ERROR(std::move(process_batch_status_)); + // Tuple is not going into hash table, add it to the output batch. 
+ Tuple* intermediate_tuple = ConstructIntermediateTuple(agg_fn_evals_, + out_batch->tuple_data_pool(), &process_batch_status_); + if (UNLIKELY(intermediate_tuple == NULL)) { + DCHECK(!process_batch_status_.ok()); + return std::move(process_batch_status_); + } + UpdateTuple(agg_fn_evals_.data(), intermediate_tuple, in_row); + out_batch_iterator.get()->set_tuple(0, intermediate_tuple); + out_batch_iterator.next(); + out_batch->commit_last_row(); + } + DCHECK(process_batch_status_.ok()); + expr_vals_cache->NextRow(); + } + DCHECK(expr_vals_cache->AtEnd()); + } + if (needs_serialize) { + FOREACH_ROW(out_batch, 0, out_batch_iter) { + NewAggFnEvaluator::Serialize(agg_fn_evals_, out_batch_iter.get()->get_tuple(0)); + } + } + + return Status::OK; +} + +bool NewPartitionedAggregationNode::TryAddToHashTable( + NewPartitionedHashTableCtx* ht_ctx, Partition* partition, + NewPartitionedHashTable* hash_tbl, TupleRow* in_row, + uint32_t hash, int* remaining_capacity, Status* status) { + DCHECK(remaining_capacity != NULL); + DCHECK_EQ(hash_tbl, partition->hash_tbl.get()); + DCHECK_GE(*remaining_capacity, 0); + bool found; + // This is called from ProcessBatchStreaming() so the rows are not aggregated. + NewPartitionedHashTable::Iterator it = hash_tbl->FindBuildRowBucket(ht_ctx, &found); + Tuple* intermediate_tuple; + if (found) { + intermediate_tuple = it.GetTuple(); + } else if (*remaining_capacity == 0) { + return false; + } else { + intermediate_tuple = ConstructIntermediateTuple(partition->agg_fn_evals, + partition->aggregated_row_stream.get(), status); + if (LIKELY(intermediate_tuple != NULL)) { + it.SetTuple(intermediate_tuple, hash); + --(*remaining_capacity); + } else { + // Avoid repeatedly trying to add tuples when under memory pressure. + *remaining_capacity = 0; + return false; + } + } + + UpdateTuple(partition->agg_fn_evals.data(), intermediate_tuple, in_row); + return true; +} + +// Instantiate required templates. 
+template Status NewPartitionedAggregationNode::ProcessBatch(RowBatch*, + NewPartitionedHashTableCtx*); +template Status NewPartitionedAggregationNode::ProcessBatch(RowBatch*, + NewPartitionedHashTableCtx*); + diff --git a/be/src/exec/new_partitioned_hash_table.cc b/be/src/exec/new_partitioned_hash_table.cc new file mode 100644 index 0000000000..170507eb15 --- /dev/null +++ b/be/src/exec/new_partitioned_hash_table.cc @@ -0,0 +1,1260 @@ +// Modifications copyright (C) 2017, Baidu.com, Inc. +// Copyright 2017 The Apache Software Foundation + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "exec/new_partitioned_hash_table.inline.h" + +#include +#include +#include + +#include "codegen/codegen_anyval.h" +#include "codegen/llvm_codegen.h" +#include "exprs/expr.h" +#include "exprs/expr_context.h" +#include "exprs/slot_ref.h" +#include "runtime/bufferpool/reservation_tracker.h" +#include "runtime/mem_tracker.h" +#include "runtime/raw_value.h" +#include "runtime/runtime_state.h" +#include "runtime/string_value.h" +#include "util/debug_util.h" +#include "util/palo_metrics.h" + +#include "common/names.h" + +using namespace palo; +// using namespace llvm; +using namespace strings; + +// DEFINE_bool(enable_quadratic_probing, true, "Enable quadratic probing hash table"); + +const char* NewPartitionedHashTableCtx::LLVM_CLASS_NAME = "class.palo::NewPartitionedHashTableCtx"; + +// Random primes to multiply the seed with. +static uint32_t SEED_PRIMES[] = { + 1, // First seed must be 1, level 0 is used by other operators in the fragment. + 1431655781, + 1183186591, + 622729787, + 472882027, + 338294347, + 275604541, + 41161739, + 29999999, + 27475109, + 611603, + 16313357, + 11380003, + 21261403, + 33393119, + 101, + 71043403 +}; + +// Put a non-zero constant in the result location for NULL. +// We don't want(NULL, 1) to hash to the same as (0, 1). +// This needs to be as big as the biggest primitive type since the bytes +// get copied directly. 
+// TODO find a better approach, since primitives like CHAR(N) can be up +// to 255 bytes +static int64_t NULL_VALUE[] = { + HashUtil::FNV_SEED, HashUtil::FNV_SEED, HashUtil::FNV_SEED, HashUtil::FNV_SEED, + HashUtil::FNV_SEED, HashUtil::FNV_SEED, HashUtil::FNV_SEED, HashUtil::FNV_SEED, + HashUtil::FNV_SEED, HashUtil::FNV_SEED, HashUtil::FNV_SEED, HashUtil::FNV_SEED, + HashUtil::FNV_SEED, HashUtil::FNV_SEED, HashUtil::FNV_SEED, HashUtil::FNV_SEED, + HashUtil::FNV_SEED, HashUtil::FNV_SEED, HashUtil::FNV_SEED, HashUtil::FNV_SEED, + HashUtil::FNV_SEED, HashUtil::FNV_SEED, HashUtil::FNV_SEED, HashUtil::FNV_SEED, + HashUtil::FNV_SEED, HashUtil::FNV_SEED, HashUtil::FNV_SEED, HashUtil::FNV_SEED, + HashUtil::FNV_SEED, HashUtil::FNV_SEED, HashUtil::FNV_SEED, HashUtil::FNV_SEED +}; + +NewPartitionedHashTableCtx::NewPartitionedHashTableCtx(const std::vector& build_exprs, + const std::vector& probe_exprs, bool stores_nulls, + const std::vector& finds_nulls, int32_t initial_seed, + int max_levels, MemPool* mem_pool) + : build_exprs_(build_exprs), + probe_exprs_(probe_exprs), + stores_nulls_(stores_nulls), + finds_nulls_(finds_nulls), + finds_some_nulls_(std::accumulate( + finds_nulls_.begin(), finds_nulls_.end(), false, std::logical_or())), + level_(0), + scratch_row_(NULL), + mem_pool_(mem_pool) { + DCHECK(!finds_some_nulls_ || stores_nulls_); + // Compute the layout and buffer size to store the evaluated expr results + DCHECK_EQ(build_exprs_.size(), probe_exprs_.size()); + DCHECK_EQ(build_exprs_.size(), finds_nulls_.size()); + DCHECK(!build_exprs_.empty()); + + // Populate the seeds to use for all the levels. TODO: revisit how we generate these. 
+ DCHECK_GE(max_levels, 0); + DCHECK_LT(max_levels, sizeof(SEED_PRIMES) / sizeof(SEED_PRIMES[0])); + DCHECK_NE(initial_seed, 0); + seeds_.resize(max_levels + 1); + seeds_[0] = initial_seed; + for (int i = 1; i <= max_levels; ++i) { + seeds_[i] = seeds_[i - 1] * SEED_PRIMES[i]; + } +} + +Status NewPartitionedHashTableCtx::Init(ObjectPool* pool, RuntimeState* state, int num_build_tuples, + MemTracker* tracker, const RowDescriptor& row_desc, const RowDescriptor& row_desc_probe) { + + int scratch_row_size = sizeof(Tuple*) * num_build_tuples; + scratch_row_ = reinterpret_cast(malloc(scratch_row_size)); + if (UNLIKELY(scratch_row_ == NULL)) { + return Status(Substitute("Failed to allocate $0 bytes for scratch row of " + "NewPartitionedHashTableCtx.", scratch_row_size)); + } + + // TODO chenhao replace ExprContext with ScalarFnEvaluator + for (int i = 0; i < build_exprs_.size(); i++) { + ExprContext* context = pool->add(new ExprContext(build_exprs_[i])); + context->prepare(state, row_desc, tracker); + if (context == nullptr) { + return Status("Hashtable init error."); + } + build_expr_evals_.push_back(context); + } + DCHECK_EQ(build_exprs_.size(), build_expr_evals_.size()); + + for (int i = 0; i < probe_exprs_.size(); i++) { + ExprContext* context = pool->add(new ExprContext(probe_exprs_[i])); + context->prepare(state, row_desc_probe, tracker); + if (context == nullptr) { + return Status("Hashtable init error."); + } + probe_expr_evals_.push_back(context); + } + DCHECK_EQ(probe_exprs_.size(), probe_expr_evals_.size()); + return expr_values_cache_.Init(state, mem_pool_->mem_tracker(), build_exprs_); +} + +Status NewPartitionedHashTableCtx::Create(ObjectPool* pool, RuntimeState* state, + const std::vector& build_exprs, + const std::vector& probe_exprs, bool stores_nulls, + const std::vector& finds_nulls, int32_t initial_seed, int max_levels, + int num_build_tuples, MemPool* mem_pool, + MemTracker* tracker, const RowDescriptor& row_desc, + const RowDescriptor& 
row_desc_probe, + scoped_ptr* ht_ctx) { + ht_ctx->reset(new NewPartitionedHashTableCtx(build_exprs, probe_exprs, stores_nulls, + finds_nulls, initial_seed, max_levels, mem_pool)); + return (*ht_ctx)->Init(pool, state, num_build_tuples, tracker, row_desc, row_desc_probe); +} + +Status NewPartitionedHashTableCtx::Open(RuntimeState* state) { + // TODO chenhao replace ExprContext with ScalarFnEvaluator + for (int i = 0; i < build_expr_evals_.size(); i++) { + RETURN_IF_ERROR(build_expr_evals_[i]->open(state)); + } + for (int i = 0; i < probe_expr_evals_.size(); i++) { + RETURN_IF_ERROR(probe_expr_evals_[i]->open(state)); + } + return Status::OK; +} + +void NewPartitionedHashTableCtx::Close(RuntimeState* state) { + free(scratch_row_); + scratch_row_ = NULL; + expr_values_cache_.Close(mem_pool_->mem_tracker()); + for (int i = 0; i < build_expr_evals_.size(); i++) { + build_expr_evals_[i]->close(state); + } + + for (int i = 0; i < probe_expr_evals_.size(); i++) { + probe_expr_evals_[i]->close(state); + } + + // TODO chenhao release new expr in Init, remove this after merging + // ScalarFnEvaluator. + build_expr_evals_.clear(); + probe_expr_evals_.clear(); +} + +void NewPartitionedHashTableCtx::FreeBuildLocalAllocations() { + //ExprContext::FreeLocalAllocations(build_expr_evals_); +} + +void NewPartitionedHashTableCtx::FreeProbeLocalAllocations() { + //ExprContext::FreeLocalAllocations(probe_expr_evals_); +} + +void NewPartitionedHashTableCtx::FreeLocalAllocations() { + FreeBuildLocalAllocations(); + FreeProbeLocalAllocations(); +} + +uint32_t NewPartitionedHashTableCtx::Hash(const void* input, int len, uint32_t hash) const { + /// Use CRC hash at first level for better performance. Switch to murmur hash at + /// subsequent levels since CRC doesn't randomize well with different seed inputs. 
+ if (level_ == 0) return HashUtil::hash(input, len, hash); + return HashUtil::murmur_hash2_64(input, len, hash); +} + +uint32_t NewPartitionedHashTableCtx::HashRow( + const uint8_t* expr_values, const uint8_t* expr_values_null) const noexcept { + DCHECK_LT(level_, seeds_.size()); + if (expr_values_cache_.var_result_offset() == -1) { + /// This handles NULLs implicitly since a constant seed value was put + /// into results buffer for nulls. + return Hash( + expr_values, expr_values_cache_.expr_values_bytes_per_row(), seeds_[level_]); + } else { + return NewPartitionedHashTableCtx::HashVariableLenRow(expr_values, expr_values_null); + } +} + +bool NewPartitionedHashTableCtx::EvalRow(TupleRow* row, const vector& ctxs, + uint8_t* expr_values, uint8_t* expr_values_null) noexcept { + bool has_null = false; + for (int i = 0; i < ctxs.size(); ++i) { + void* loc = expr_values_cache_.ExprValuePtr(expr_values, i); + void* val = ctxs[i]->get_value(row); + if (val == NULL) { + // If the table doesn't store nulls, no reason to keep evaluating + if (!stores_nulls_) return true; + expr_values_null[i] = true; + val = reinterpret_cast(&NULL_VALUE); + has_null = true; + } else { + expr_values_null[i] = false; + } + DCHECK_LE(build_exprs_[i]->type().get_slot_size(), + sizeof(NULL_VALUE)); + RawValue::write(val, loc, build_exprs_[i]->type(), NULL); + } + return has_null; +} + +uint32_t NewPartitionedHashTableCtx::HashVariableLenRow(const uint8_t* expr_values, + const uint8_t* expr_values_null) const { + uint32_t hash = seeds_[level_]; + int var_result_offset = expr_values_cache_.var_result_offset(); + // Hash the non-var length portions (if there are any) + if (var_result_offset != 0) { + hash = Hash(expr_values, var_result_offset, hash); + } + + for (int i = 0; i < build_exprs_.size(); ++i) { + // non-string and null slots are already part of 'expr_values'. 
+ // if (build_expr_ctxs_[i]->root()->type().type != TYPE_STRING + if (build_exprs_[i]->type().type != TYPE_VARCHAR) continue; + + const void* loc = expr_values_cache_.ExprValuePtr(expr_values, i); + if (expr_values_null[i]) { + // Hash the null random seed values at 'loc' + hash = Hash(loc, sizeof(StringValue), hash); + } else { + // Hash the string + // TODO: when using CRC hash on empty string, this only swaps bytes. + const StringValue* str = reinterpret_cast(loc); + hash = Hash(str->ptr, str->len, hash); + } + } + return hash; +} + +template +bool NewPartitionedHashTableCtx::Equals(TupleRow* build_row, const uint8_t* expr_values, + const uint8_t* expr_values_null) const noexcept { + for (int i = 0; i < build_expr_evals_.size(); ++i) { + void* val = build_expr_evals_[i]->get_value(build_row); + if (val == NULL) { + if (!(FORCE_NULL_EQUALITY || finds_nulls_[i])) return false; + if (!expr_values_null[i]) return false; + continue; + } else { + if (expr_values_null[i]) return false; + } + + const void* loc = expr_values_cache_.ExprValuePtr(expr_values, i); + if (!RawValue::eq(loc, val, build_exprs_[i]->type())) { + return false; + } + } + return true; +} + +template bool NewPartitionedHashTableCtx::Equals(TupleRow* build_row, + const uint8_t* expr_values, const uint8_t* expr_values_null) const; +template bool NewPartitionedHashTableCtx::Equals(TupleRow* build_row, + const uint8_t* expr_values, const uint8_t* expr_values_null) const; + +NewPartitionedHashTableCtx::ExprValuesCache::ExprValuesCache() + : capacity_(0), + cur_expr_values_(NULL), + cur_expr_values_null_(NULL), + cur_expr_values_hash_(NULL), + cur_expr_values_hash_end_(NULL), + expr_values_array_(NULL), + expr_values_null_array_(NULL), + expr_values_hash_array_(NULL), + null_bitmap_(0) {} + +Status NewPartitionedHashTableCtx::ExprValuesCache::Init(RuntimeState* state, + MemTracker* tracker, const std::vector& build_exprs) { + // Initialize the number of expressions. 
+ num_exprs_ = build_exprs.size(); + // Compute the layout of evaluated values of a row. + expr_values_bytes_per_row_ = Expr::compute_results_layout(build_exprs, + &expr_values_offsets_, &var_result_offset_); + if (expr_values_bytes_per_row_ == 0) { + DCHECK_EQ(num_exprs_, 0); + return Status::OK; + } + DCHECK_GT(expr_values_bytes_per_row_, 0); + // Compute the maximum number of cached rows which can fit in the memory budget. + // TODO: Find the optimal prefetch batch size. This may be something + // processor dependent so we may need calibration at Impala startup time. + capacity_ = std::max(1, std::min(state->batch_size(), + MAX_EXPR_VALUES_ARRAY_SIZE / expr_values_bytes_per_row_)); + + int mem_usage = MemUsage(capacity_, expr_values_bytes_per_row_, num_exprs_); + if (UNLIKELY(!tracker->try_consume(mem_usage))) { + capacity_ = 0; + string details = Substitute("NewPartitionedHashTableCtx::ExprValuesCache failed to allocate $0 bytes.", + mem_usage); + return tracker->MemLimitExceeded(state, details, mem_usage); + } + + int expr_values_size = expr_values_bytes_per_row_ * capacity_; + expr_values_array_.reset(new uint8_t[expr_values_size]); + cur_expr_values_ = expr_values_array_.get(); + memset(cur_expr_values_, 0, expr_values_size); + + int expr_values_null_size = num_exprs_ * capacity_; + expr_values_null_array_.reset(new uint8_t[expr_values_null_size]); + cur_expr_values_null_ = expr_values_null_array_.get(); + memset(cur_expr_values_null_, 0, expr_values_null_size); + + expr_values_hash_array_.reset(new uint32_t[capacity_]); + cur_expr_values_hash_ = expr_values_hash_array_.get(); + cur_expr_values_hash_end_ = cur_expr_values_hash_; + memset(cur_expr_values_hash_, 0, sizeof(uint32) * capacity_); + + null_bitmap_.Reset(capacity_); + return Status::OK; +} + +void NewPartitionedHashTableCtx::ExprValuesCache::Close(MemTracker* tracker) { + if (capacity_ == 0) return; + cur_expr_values_ = NULL; + cur_expr_values_null_ = NULL; + cur_expr_values_hash_ = NULL; + 
cur_expr_values_hash_end_ = NULL; + expr_values_array_.reset(); + expr_values_null_array_.reset(); + expr_values_hash_array_.reset(); + null_bitmap_.Reset(0); + int mem_usage = MemUsage(capacity_, expr_values_bytes_per_row_, num_exprs_); + tracker->release(mem_usage); +} + +int NewPartitionedHashTableCtx::ExprValuesCache::MemUsage(int capacity, + int expr_values_bytes_per_row, int num_exprs) { + return expr_values_bytes_per_row * capacity + // expr_values_array_ + num_exprs * capacity + // expr_values_null_array_ + sizeof(uint32) * capacity + // expr_values_hash_array_ + Bitmap::MemUsage(capacity); // null_bitmap_ +} + +uint8_t* NewPartitionedHashTableCtx::ExprValuesCache::ExprValuePtr( + uint8_t* expr_values, int expr_idx) const { + return expr_values + expr_values_offsets_[expr_idx]; +} + +const uint8_t* NewPartitionedHashTableCtx::ExprValuesCache::ExprValuePtr( + const uint8_t* expr_values, int expr_idx) const { + return expr_values + expr_values_offsets_[expr_idx]; +} + +void NewPartitionedHashTableCtx::ExprValuesCache::ResetIterators() { + cur_expr_values_ = expr_values_array_.get(); + cur_expr_values_null_ = expr_values_null_array_.get(); + cur_expr_values_hash_ = expr_values_hash_array_.get(); +} + +void NewPartitionedHashTableCtx::ExprValuesCache::Reset() noexcept { + ResetIterators(); + // Set the end pointer after resetting the other pointers so they point to + // the same location. + cur_expr_values_hash_end_ = cur_expr_values_hash_; + null_bitmap_.SetAllBits(false); +} + +void NewPartitionedHashTableCtx::ExprValuesCache::ResetForRead() { + // Record the end of hash values iterator to be used in AtEnd(). + // Do it before resetting the pointers. 
+ cur_expr_values_hash_end_ = cur_expr_values_hash_; + ResetIterators(); +} + +constexpr double NewPartitionedHashTable::MAX_FILL_FACTOR; +constexpr int64_t NewPartitionedHashTable::DATA_PAGE_SIZE; + +NewPartitionedHashTable* NewPartitionedHashTable::Create(Suballocator* allocator, bool stores_duplicates, + int num_build_tuples, BufferedTupleStream3* tuple_stream, int64_t max_num_buckets, + int64_t initial_num_buckets) { + return new NewPartitionedHashTable(config::enable_quadratic_probing, allocator, stores_duplicates, + num_build_tuples, tuple_stream, max_num_buckets, initial_num_buckets); +} + +NewPartitionedHashTable::NewPartitionedHashTable(bool quadratic_probing, Suballocator* allocator, + bool stores_duplicates, int num_build_tuples, BufferedTupleStream3* stream, + int64_t max_num_buckets, int64_t num_buckets) + : allocator_(allocator), + tuple_stream_(stream), + stores_tuples_(num_build_tuples == 1), + stores_duplicates_(stores_duplicates), + quadratic_probing_(quadratic_probing), + total_data_page_size_(0), + next_node_(NULL), + node_remaining_current_page_(0), + num_duplicate_nodes_(0), + max_num_buckets_(max_num_buckets), + buckets_(NULL), + num_buckets_(num_buckets), + num_filled_buckets_(0), + num_buckets_with_duplicates_(0), + num_build_tuples_(num_build_tuples), + has_matches_(false), + num_probes_(0), num_failed_probes_(0), travel_length_(0), num_hash_collisions_(0), + num_resizes_(0) { + DCHECK_EQ((num_buckets & (num_buckets - 1)), 0) << "num_buckets must be a power of 2"; + DCHECK_GT(num_buckets, 0) << "num_buckets must be larger than 0"; + DCHECK(stores_tuples_ || stream != NULL); +} + +Status NewPartitionedHashTable::Init(bool* got_memory) { + int64_t buckets_byte_size = num_buckets_ * sizeof(Bucket); + RETURN_IF_ERROR(allocator_->Allocate(buckets_byte_size, &bucket_allocation_)); + if (bucket_allocation_ == nullptr) { + num_buckets_ = 0; + *got_memory = false; + return Status::OK; + } + buckets_ = reinterpret_cast(bucket_allocation_->data()); + 
memset(buckets_, 0, buckets_byte_size); + *got_memory = true; + return Status::OK; +} + +void NewPartitionedHashTable::Close() { + // Print statistics only for the large or heavily used hash tables. + // TODO: Tweak these numbers/conditions, or print them always? + const int64_t LARGE_HT = 128 * 1024; + const int64_t HEAVILY_USED = 1024 * 1024; + // TODO: These statistics should go to the runtime profile as well. + if ((num_buckets_ > LARGE_HT) || (num_probes_ > HEAVILY_USED)) VLOG(2) << PrintStats(); + for (auto& data_page : data_pages_) allocator_->Free(move(data_page)); + data_pages_.clear(); + //if (PaloMetrics::hash_table_total_bytes() != NULL) { + // PaloMetrics::hash_table_total_bytes()->increment(-total_data_page_size_); + //} + if (bucket_allocation_ != nullptr) allocator_->Free(move(bucket_allocation_)); +} + +Status NewPartitionedHashTable::CheckAndResize( + uint64_t buckets_to_fill, const NewPartitionedHashTableCtx* ht_ctx, bool* got_memory) { + uint64_t shift = 0; + while (num_filled_buckets_ + buckets_to_fill > + (num_buckets_ << shift) * MAX_FILL_FACTOR) { + ++shift; + } + if (shift > 0) return ResizeBuckets(num_buckets_ << shift, ht_ctx, got_memory); + *got_memory = true; + return Status::OK; +} + +Status NewPartitionedHashTable::ResizeBuckets( + int64_t num_buckets, const NewPartitionedHashTableCtx* ht_ctx, bool* got_memory) { + DCHECK_EQ((num_buckets & (num_buckets - 1)), 0) + << "num_buckets=" << num_buckets << " must be a power of 2"; + DCHECK_GT(num_buckets, num_filled_buckets_) + << "Cannot shrink the hash table to smaller number of buckets than the number of " + << "filled buckets."; + VLOG(2) << "Resizing hash table from " << num_buckets_ << " to " << num_buckets + << " buckets."; + if (max_num_buckets_ != -1 && num_buckets > max_num_buckets_) { + *got_memory = false; + return Status::OK; + } + ++num_resizes_; + + // All memory that can grow proportional to the input should come from the block mgrs + // mem tracker. 
+ // Note that while we copying over the contents of the old hash table, we need to have + // allocated both the old and the new hash table. Once we finish, we return the memory + // of the old hash table. + // int64_t old_size = num_buckets_ * sizeof(Bucket); + int64_t new_size = num_buckets * sizeof(Bucket); + + std::unique_ptr new_allocation; + RETURN_IF_ERROR(allocator_->Allocate(new_size, &new_allocation)); + if (new_allocation == NULL) { + *got_memory = false; + return Status::OK; + } + Bucket* new_buckets = reinterpret_cast(new_allocation->data()); + memset(new_buckets, 0, new_size); + + // Walk the old table and copy all the filled buckets to the new (resized) table. + // We do not have to do anything with the duplicate nodes. This operation is expected + // to succeed. + for (NewPartitionedHashTable::Iterator iter = Begin(ht_ctx); !iter.AtEnd(); + NextFilledBucket(&iter.bucket_idx_, &iter.node_)) { + Bucket* bucket_to_copy = &buckets_[iter.bucket_idx_]; + bool found = false; + int64_t bucket_idx = + Probe(new_buckets, num_buckets, NULL, bucket_to_copy->hash, &found); + DCHECK(!found); + DCHECK_NE(bucket_idx, Iterator::BUCKET_NOT_FOUND) << " Probe failed even though " + " there are free buckets. 
" << num_buckets << " " << num_filled_buckets_; + Bucket* dst_bucket = &new_buckets[bucket_idx]; + *dst_bucket = *bucket_to_copy; + } + + num_buckets_ = num_buckets; + allocator_->Free(move(bucket_allocation_)); + bucket_allocation_ = std::move(new_allocation); + buckets_ = reinterpret_cast(bucket_allocation_->data()); + *got_memory = true; + return Status::OK; +} + +bool NewPartitionedHashTable::GrowNodeArray(Status* status) { + std::unique_ptr allocation; + *status = allocator_->Allocate(DATA_PAGE_SIZE, &allocation); + if (!status->ok() || allocation == nullptr) return false; + next_node_ = reinterpret_cast(allocation->data()); + data_pages_.push_back(std::move(allocation)); + //PaloMetrics::hash_table_total_bytes()->increment(DATA_PAGE_SIZE); + node_remaining_current_page_ = DATA_PAGE_SIZE / sizeof(DuplicateNode); + total_data_page_size_ += DATA_PAGE_SIZE; + return true; +} + +void NewPartitionedHashTable::DebugStringTuple(std::stringstream& ss, HtData& htdata, + const RowDescriptor* desc) { + if (stores_tuples_) { + ss << "(" << htdata.tuple << ")"; + } else { + ss << "(" << htdata.flat_row << ")"; + } + if (desc != NULL) { + Tuple* row[num_build_tuples_]; + ss << " " << print_row(GetRow(htdata, reinterpret_cast(row)), *desc); + } +} + +string NewPartitionedHashTable::DebugString(bool skip_empty, bool show_match, + const RowDescriptor* desc) { + std::stringstream ss; + ss << std::endl; + for (int i = 0; i < num_buckets_; ++i) { + if (skip_empty && !buckets_[i].filled) continue; + ss << i << ": "; + if (show_match) { + if (buckets_[i].matched) { + ss << " [M]"; + } else { + ss << " [U]"; + } + } + if (buckets_[i].hasDuplicates) { + DuplicateNode* node = buckets_[i].bucketData.duplicates; + bool first = true; + ss << " [D] "; + while (node != NULL) { + if (!first) ss << ","; + DebugStringTuple(ss, node->htdata, desc); + node = node->next; + first = false; + } + } else { + ss << " [B] "; + if (buckets_[i].filled) { + DebugStringTuple(ss, 
buckets_[i].bucketData.htdata, desc); + } else { + ss << " - "; + } + } + ss << std::endl; + } + return ss.str(); +} + +string NewPartitionedHashTable::PrintStats() const { + double curr_fill_factor = (double)num_filled_buckets_/(double)num_buckets_; + double avg_travel = (double)travel_length_/(double)num_probes_; + double avg_collisions = (double)num_hash_collisions_/(double)num_filled_buckets_; + std::stringstream ss; + ss << "Buckets: " << num_buckets_ << " " << num_filled_buckets_ << " " + << curr_fill_factor << std::endl; + ss << "Duplicates: " << num_buckets_with_duplicates_ << " buckets " + << num_duplicate_nodes_ << " nodes" << std::endl; + ss << "Probes: " << num_probes_ << std::endl; + ss << "FailedProbes: " << num_failed_probes_ << std::endl; + ss << "Travel: " << travel_length_ << " " << avg_travel << std::endl; + ss << "HashCollisions: " << num_hash_collisions_ << " " << avg_collisions << std::endl; + ss << "Resizes: " << num_resizes_ << std::endl; + return ss.str(); +} + +#if 0 + +// Helper function to store a value into the results buffer if the expr +// evaluated to NULL. We don't want (NULL, 1) to hash to the same as (0,1) so +// we'll pick a more random value. 
+static void CodegenAssignNullValue( + LlvmCodeGen* codegen, LlvmBuilder* builder, Value* dst, const ColumnType& type) { + uint64_t fnv_seed = HashUtil::FNV_SEED; + + if (type.type == TYPE_STRING || type.type == TYPE_VARCHAR) { + Value* dst_ptr = builder->CreateStructGEP(NULL, dst, 0, "string_ptr"); + Value* dst_len = builder->CreateStructGEP(NULL, dst, 1, "string_len"); + Value* null_len = codegen->GetIntConstant(TYPE_INT, fnv_seed); + Value* null_ptr = builder->CreateIntToPtr(null_len, codegen->ptr_type()); + builder->CreateStore(null_ptr, dst_ptr); + builder->CreateStore(null_len, dst_len); + } else { + Value* null_value = NULL; + int byte_size = type.GetByteSize(); + // Get a type specific representation of fnv_seed + switch (type.type) { + case TYPE_BOOLEAN: + // In results, booleans are stored as 1 byte + dst = builder->CreateBitCast(dst, codegen->ptr_type()); + null_value = codegen->GetIntConstant(TYPE_TINYINT, fnv_seed); + break; + case TYPE_TIMESTAMP: { + // Cast 'dst' to 'i128*' + DCHECK_EQ(byte_size, 16); + PointerType* fnv_seed_ptr_type = + codegen->GetPtrType(Type::getIntNTy(codegen->context(), byte_size * 8)); + dst = builder->CreateBitCast(dst, fnv_seed_ptr_type); + null_value = codegen->GetIntConstant(byte_size, fnv_seed, fnv_seed); + break; + } + case TYPE_TINYINT: + case TYPE_SMALLINT: + case TYPE_INT: + case TYPE_BIGINT: + case TYPE_DECIMAL: + null_value = codegen->GetIntConstant(byte_size, fnv_seed, fnv_seed); + break; + case TYPE_FLOAT: { + // Don't care about the value, just the bit pattern + float fnv_seed_float = *reinterpret_cast(&fnv_seed); + null_value = ConstantFP::get(codegen->context(), APFloat(fnv_seed_float)); + break; + } + case TYPE_DOUBLE: { + // Don't care about the value, just the bit pattern + double fnv_seed_double = *reinterpret_cast(&fnv_seed); + null_value = ConstantFP::get(codegen->context(), APFloat(fnv_seed_double)); + break; + } + default: + DCHECK(false); + } + builder->CreateStore(null_value, dst); + } +} + +// 
Codegen for evaluating a tuple row over either build_expr_ctxs_ or probe_expr_ctxs_. +// For a group by with (big int, string) the IR looks like: +// +// define i1 @EvalProbeRow(%"class.impala::NewPartitionedHashTableCtx"* %this_ptr, +// %"class.impala::TupleRow"* %row, i8* %expr_values, i8* %expr_values_null) #34 { +// entry: +// %loc_addr = getelementptr i8, i8* %expr_values, i32 0 +// %loc = bitcast i8* %loc_addr to i64* +// %result = call { i8, i64 } @GetSlotRef.2(%"class.impala::ExprContext"* +// inttoptr (i64 197737664 to %"class.impala::ExprContext"*), +// %"class.impala::TupleRow"* %row) +// %0 = extractvalue { i8, i64 } %result, 0 +// %is_null = trunc i8 %0 to i1 +// %1 = zext i1 %is_null to i8 +// %null_byte_loc = getelementptr i8, i8* %expr_values_null, i32 0 +// store i8 %1, i8* %null_byte_loc +// br i1 %is_null, label %null, label %not_null +// +// null: ; preds = %entry +// store i64 2166136261, i64* %loc +// br label %continue +// +// not_null: ; preds = %entry +// %val = extractvalue { i8, i64 } %result, 1 +// store i64 %val, i64* %loc +// br label %continue +// +// continue: ; preds = %not_null, %null +// %is_null_phi = phi i1 [ true, %null ], [ false, %not_null ] +// %has_null = or i1 false, %is_null_phi +// %loc_addr1 = getelementptr i8, i8* %expr_values, i32 8 +// %loc2 = bitcast i8* %loc_addr1 to %"struct.impala::StringValue"* +// %result6 = call { i64, i8* } @GetSlotRef.3(%"class.impala::ExprContext"* +// inttoptr (i64 197738048 to %"class.impala::ExprContext"*), +// %"class.impala::TupleRow"* %row) +// %2 = extractvalue { i64, i8* } %result6, 0 +// %is_null7 = trunc i64 %2 to i1 +// %3 = zext i1 %is_null7 to i8 +// %null_byte_loc8 = getelementptr i8, i8* %expr_values_null, i32 1 +// store i8 %3, i8* %null_byte_loc8 +// br i1 %is_null7, label %null3, label %not_null4 +// +// null3: ; preds = %continue +// %string_ptr = getelementptr inbounds %"struct.impala::StringValue", +// %"struct.impala::StringValue"* %loc2, i32 0, i32 0 +// %string_len = 
getelementptr inbounds %"struct.impala::StringValue", +// %"struct.impala::StringValue"* %loc2, i32 0, i32 1 +// store i8* inttoptr (i32 -2128831035 to i8*), i8** %string_ptr +// store i32 -2128831035, i32* %string_len +// br label %continue5 +// +// not_null4: ; preds = %continue +// %4 = extractvalue { i64, i8* } %result6, 0 +// %5 = ashr i64 %4, 32 +// %6 = trunc i64 %5 to i32 +// %7 = insertvalue %"struct.impala::StringValue" zeroinitializer, i32 %6, 1 +// %result9 = extractvalue { i64, i8* } %result6, 1 +// %8 = insertvalue %"struct.impala::StringValue" %7, i8* %result9, 0 +// store %"struct.impala::StringValue" %8, %"struct.impala::StringValue"* %loc2 +// br label %continue5 +// +// continue5: ; preds = %not_null4, %null3 +// %is_null_phi10 = phi i1 [ true, %null3 ], [ false, %not_null4 ] +// %has_null11 = or i1 %has_null, %is_null_phi10 +// ret i1 %has_null11 +// } +// +// For each expr, we create 3 code blocks. The null, not null and continue blocks. +// Both the null and not null branch into the continue block. The continue block +// becomes the start of the next block for codegen (either the next expr or just the +// end of the function). +Status NewPartitionedHashTableCtx::CodegenEvalRow(LlvmCodeGen* codegen, bool build, Function** fn) { + const vector& ctxs = build ? build_expr_ctxs_ : probe_expr_ctxs_; + for (int i = 0; i < ctxs.size(); ++i) { + // Disable codegen for CHAR + if (ctxs[i]->root()->type().type == TYPE_CHAR) { + return Status("NewPartitionedHashTableCtx::CodegenEvalRow(): CHAR NYI"); + } + } + + // Get types to generate function prototype + Type* this_type = codegen->GetType(NewPartitionedHashTableCtx::LLVM_CLASS_NAME); + DCHECK(this_type != NULL); + PointerType* this_ptr_type = codegen->GetPtrType(this_type); + Type* tuple_row_type = codegen->GetType(TupleRow::LLVM_CLASS_NAME); + DCHECK(tuple_row_type != NULL); + PointerType* tuple_row_ptr_type = codegen->GetPtrType(tuple_row_type); + LlvmCodeGen::FnPrototype prototype(codegen, build ? 
"EvalBuildRow" : "EvalProbeRow", + codegen->GetType(TYPE_BOOLEAN)); + prototype.AddArgument(LlvmCodeGen::NamedVariable("this_ptr", this_ptr_type)); + prototype.AddArgument(LlvmCodeGen::NamedVariable("row", tuple_row_ptr_type)); + prototype.AddArgument(LlvmCodeGen::NamedVariable("expr_values", codegen->ptr_type())); + prototype.AddArgument( + LlvmCodeGen::NamedVariable("expr_values_null", codegen->ptr_type())); + + LLVMContext& context = codegen->context(); + LlvmBuilder builder(context); + Value* args[4]; + *fn = prototype.GeneratePrototype(&builder, args); + Value* this_ptr = args[0]; + Value* row = args[1]; + Value* expr_values = args[2]; + Value* expr_values_null = args[3]; + Value* has_null = codegen->false_value(); + + // ctx_vector = &build_expr_ctxs_[0] / ctx_vector = &probe_expr_ctxs_[0] + Value* ctx_vector = codegen->CodegenCallFunction(&builder, build ? + IRFunction::HASH_TABLE_GET_BUILD_EXPR_CTX : + IRFunction::HASH_TABLE_GET_PROBE_EXPR_CTX, + this_ptr, "ctx_vector"); + + for (int i = 0; i < ctxs.size(); ++i) { + // TODO: refactor this to somewhere else? 
This is not hash table specific except for + // the null handling bit and would be used for anyone that needs to materialize a + // vector of exprs + // Convert result buffer to llvm ptr type + int offset = expr_values_cache_.expr_values_offsets(i); + Value* loc = builder.CreateInBoundsGEP( + NULL, expr_values, codegen->GetIntConstant(TYPE_INT, offset), "loc_addr"); + Value* llvm_loc = builder.CreatePointerCast( + loc, codegen->GetPtrType(ctxs[i]->root()->type()), "loc"); + + BasicBlock* null_block = BasicBlock::Create(context, "null", *fn); + BasicBlock* not_null_block = BasicBlock::Create(context, "not_null", *fn); + BasicBlock* continue_block = BasicBlock::Create(context, "continue", *fn); + + // Call expr + Function* expr_fn; + Status status = ctxs[i]->root()->GetCodegendComputeFn(codegen, &expr_fn); + if (!status.ok()) { + (*fn)->eraseFromParent(); // deletes function + *fn = NULL; + return Status(Substitute( + "Problem with NewPartitionedHashTableCtx::CodegenEvalRow(): $0", status.GetDetail())); + } + + // Avoid bloating function by inlining too many exprs into it. 
+ if (i >= LlvmCodeGen::CODEGEN_INLINE_EXPRS_THRESHOLD) { + codegen->SetNoInline(expr_fn); + } + + Value* expr_ctx = codegen->CodegenArrayAt(&builder, ctx_vector, i, "expr_ctx"); + CodegenAnyVal result = CodegenAnyVal::CreateCallWrapped( + codegen, &builder, ctxs[i]->root()->type(), expr_fn, {expr_ctx, row}, "result"); + Value* is_null = result.GetIsNull(); + + // Set null-byte result + Value* null_byte = builder.CreateZExt(is_null, codegen->GetType(TYPE_TINYINT)); + Value* llvm_null_byte_loc = builder.CreateInBoundsGEP( + NULL, expr_values_null, codegen->GetIntConstant(TYPE_INT, i), "null_byte_loc"); + builder.CreateStore(null_byte, llvm_null_byte_loc); + builder.CreateCondBr(is_null, null_block, not_null_block); + + // Null block + builder.SetInsertPoint(null_block); + if (!stores_nulls_) { + // hash table doesn't store nulls, no reason to keep evaluating exprs + builder.CreateRet(codegen->true_value()); + } else { + CodegenAssignNullValue(codegen, &builder, llvm_loc, ctxs[i]->root()->type()); + builder.CreateBr(continue_block); + } + + // Not null block + builder.SetInsertPoint(not_null_block); + result.ToNativePtr(llvm_loc); + builder.CreateBr(continue_block); + + // Continue block + builder.SetInsertPoint(continue_block); + if (stores_nulls_) { + // Update has_null + PHINode* is_null_phi = builder.CreatePHI(codegen->boolean_type(), 2, "is_null_phi"); + is_null_phi->addIncoming(codegen->true_value(), null_block); + is_null_phi->addIncoming(codegen->false_value(), not_null_block); + has_null = builder.CreateOr(has_null, is_null_phi, "has_null"); + } + } + builder.CreateRet(has_null); + + // Avoid inlining a large EvalRow() function into caller. 
+ if (ctxs.size() > LlvmCodeGen::CODEGEN_INLINE_EXPR_BATCH_THRESHOLD) { + codegen->SetNoInline(*fn); + } + + *fn = codegen->FinalizeFunction(*fn); + if (*fn == NULL) { + return Status("Codegen'd NewPartitionedHashTableCtx::EvalRow() function failed verification, " + "see log"); + } + return Status::OK; +} + +// Codegen for hashing the current row. In the case with both string and non-string data +// (group by int_col, string_col), the IR looks like: +// +// define i32 @HashRow(%"class.impala::NewPartitionedHashTableCtx"* %this_ptr, i8* %expr_values, +// i8* %expr_values_null) #34 { +// entry: +// %seed = call i32 @_ZNK6impala12NewPartitionedHashTableCtx11GetHashSeedEv( +// %"class.impala::NewPartitionedHashTableCtx"* %this_ptr) +// %hash = call i32 @CrcHash8(i8* %expr_values, i32 8, i32 %seed) +// %loc_addr = getelementptr i8, i8* %expr_values, i32 8 +// %null_byte_loc = getelementptr i8, i8* %expr_values_null, i32 1 +// %null_byte = load i8, i8* %null_byte_loc +// %is_null = icmp ne i8 %null_byte, 0 +// br i1 %is_null, label %null, label %not_null +// +// null: ; preds = %entry +// %str_null = call i32 @CrcHash16(i8* %loc_addr, i32 16, i32 %hash) +// br label %continue +// +// not_null: ; preds = %entry +// %str_val = bitcast i8* %loc_addr to %"struct.impala::StringValue"* +// %0 = getelementptr inbounds %"struct.impala::StringValue", +// %"struct.impala::StringValue"* %str_val, i32 0, i32 0 +// %1 = getelementptr inbounds %"struct.impala::StringValue", +// %"struct.impala::StringValue"* %str_val, i32 0, i32 1 +// %ptr = load i8*, i8** %0 +// %len = load i32, i32* %1 +// %string_hash = call i32 @IrCrcHash(i8* %ptr, i32 %len, i32 %hash) +// br label %continue +// +// continue: ; preds = %not_null, %null +// %hash_phi = phi i32 [ %string_hash, %not_null ], [ %str_null, %null ] +// ret i32 %hash_phi +// } +Status NewPartitionedHashTableCtx::CodegenHashRow(LlvmCodeGen* codegen, bool use_murmur, Function** fn) { + for (int i = 0; i < build_expr_ctxs_.size(); ++i) { + 
// Disable codegen for CHAR + if (build_expr_ctxs_[i]->root()->type().type == TYPE_CHAR) { + return Status("NewPartitionedHashTableCtx::CodegenHashRow(): CHAR NYI"); + } + } + + // Get types to generate function prototype + Type* this_type = codegen->GetType(NewPartitionedHashTableCtx::LLVM_CLASS_NAME); + DCHECK(this_type != NULL); + PointerType* this_ptr_type = codegen->GetPtrType(this_type); + + LlvmCodeGen::FnPrototype prototype( + codegen, (use_murmur ? "MurmurHashRow" : "HashRow"), codegen->GetType(TYPE_INT)); + prototype.AddArgument(LlvmCodeGen::NamedVariable("this_ptr", this_ptr_type)); + prototype.AddArgument(LlvmCodeGen::NamedVariable("expr_values", codegen->ptr_type())); + prototype.AddArgument( + LlvmCodeGen::NamedVariable("expr_values_null", codegen->ptr_type())); + + LLVMContext& context = codegen->context(); + LlvmBuilder builder(context); + Value* args[3]; + *fn = prototype.GeneratePrototype(&builder, args); + Value* this_arg = args[0]; + Value* expr_values = args[1]; + Value* expr_values_null = args[2]; + + // Call GetHashSeed() to get seeds_[level_] + Value* seed = codegen->CodegenCallFunction(&builder, + IRFunction::HASH_TABLE_GET_HASH_SEED, this_arg, "seed"); + + Value* hash_result = seed; + const int var_result_offset = expr_values_cache_.var_result_offset(); + const int expr_values_bytes_per_row = expr_values_cache_.expr_values_bytes_per_row(); + if (var_result_offset == -1) { + // No variable length slots, just hash what is in 'expr_expr_values_cache_' + if (expr_values_bytes_per_row > 0) { + Function* hash_fn = use_murmur ? + codegen->GetMurmurHashFunction(expr_values_bytes_per_row) : + codegen->GetHashFunction(expr_values_bytes_per_row); + Value* len = codegen->GetIntConstant(TYPE_INT, expr_values_bytes_per_row); + hash_result = builder.CreateCall( + hash_fn, ArrayRef({expr_values, len, hash_result}), "hash"); + } + } else { + if (var_result_offset > 0) { + Function* hash_fn = use_murmur ? 
+ codegen->GetMurmurHashFunction(var_result_offset) : + codegen->GetHashFunction(var_result_offset); + Value* len = codegen->GetIntConstant(TYPE_INT, var_result_offset); + hash_result = builder.CreateCall( + hash_fn, ArrayRef({expr_values, len, hash_result}), "hash"); + } + + // Hash string slots + for (int i = 0; i < build_expr_ctxs_.size(); ++i) { + if (build_expr_ctxs_[i]->root()->type().type != TYPE_STRING + && build_expr_ctxs_[i]->root()->type().type != TYPE_VARCHAR) continue; + + BasicBlock* null_block = NULL; + BasicBlock* not_null_block = NULL; + BasicBlock* continue_block = NULL; + Value* str_null_result = NULL; + + int offset = expr_values_cache_.expr_values_offsets(i); + Value* llvm_loc = builder.CreateInBoundsGEP( + NULL, expr_values, codegen->GetIntConstant(TYPE_INT, offset), "loc_addr"); + + // If the hash table stores nulls, we need to check if the stringval + // evaluated to NULL + if (stores_nulls_) { + null_block = BasicBlock::Create(context, "null", *fn); + not_null_block = BasicBlock::Create(context, "not_null", *fn); + continue_block = BasicBlock::Create(context, "continue", *fn); + + Value* llvm_null_byte_loc = builder.CreateInBoundsGEP(NULL, expr_values_null, + codegen->GetIntConstant(TYPE_INT, i), "null_byte_loc"); + Value* null_byte = builder.CreateLoad(llvm_null_byte_loc, "null_byte"); + Value* is_null = builder.CreateICmpNE( + null_byte, codegen->GetIntConstant(TYPE_TINYINT, 0), "is_null"); + builder.CreateCondBr(is_null, null_block, not_null_block); + + // For null, we just want to call the hash function on the portion of + // the data + builder.SetInsertPoint(null_block); + Function* null_hash_fn = use_murmur ? 
+ codegen->GetMurmurHashFunction(sizeof(StringValue)) : + codegen->GetHashFunction(sizeof(StringValue)); + Value* len = codegen->GetIntConstant(TYPE_INT, sizeof(StringValue)); + str_null_result = builder.CreateCall(null_hash_fn, + ArrayRef({llvm_loc, len, hash_result}), "str_null"); + builder.CreateBr(continue_block); + + builder.SetInsertPoint(not_null_block); + } + + // Convert expr_values_buffer_ loc to llvm value + Value* str_val = builder.CreatePointerCast(llvm_loc, + codegen->GetPtrType(TYPE_STRING), "str_val"); + + Value* ptr = builder.CreateStructGEP(NULL, str_val, 0); + Value* len = builder.CreateStructGEP(NULL, str_val, 1); + ptr = builder.CreateLoad(ptr, "ptr"); + len = builder.CreateLoad(len, "len"); + + // Call hash(ptr, len, hash_result); + Function* general_hash_fn = use_murmur ? codegen->GetMurmurHashFunction() : + codegen->GetHashFunction(); + Value* string_hash_result = builder.CreateCall(general_hash_fn, + ArrayRef({ptr, len, hash_result}), "string_hash"); + + if (stores_nulls_) { + builder.CreateBr(continue_block); + builder.SetInsertPoint(continue_block); + // Use phi node to reconcile that we could have come from the string-null + // path and string not null paths. + PHINode* phi_node = builder.CreatePHI(codegen->GetType(TYPE_INT), 2, "hash_phi"); + phi_node->addIncoming(string_hash_result, not_null_block); + phi_node->addIncoming(str_null_result, null_block); + hash_result = phi_node; + } else { + hash_result = string_hash_result; + } + } + } + + builder.CreateRet(hash_result); + + // Avoid inlining into caller if there are many exprs. + if (build_expr_ctxs_.size() > LlvmCodeGen::CODEGEN_INLINE_EXPR_BATCH_THRESHOLD) { + codegen->SetNoInline(*fn); + } + *fn = codegen->FinalizeFunction(*fn); + if (*fn == NULL) { + return Status( + "Codegen'd NewPartitionedHashTableCtx::HashRow() function failed verification, see log"); + } + return Status::OK; +} + +// Codegen for NewPartitionedHashTableCtx::Equals. 
For a group by with (bigint, string), +// the IR looks like: +// +// define i1 @Equals(%"class.impala::NewPartitionedHashTableCtx"* %this_ptr, %"class.impala::TupleRow"* +// %row, +// i8* %expr_values, i8* %expr_values_null) #34 { +// entry: +// %0 = alloca { i64, i8* } +// %result = call { i8, i64 } @GetSlotRef.2(%"class.impala::ExprContext"* +// inttoptr (i64 139107136 to %"class.impala::ExprContext"*), +// %"class.impala::TupleRow"* %row) +// %1 = extractvalue { i8, i64 } %result, 0 +// %is_null = trunc i8 %1 to i1 +// %null_byte_loc = getelementptr i8, i8* %expr_values_null, i32 0 +// %2 = load i8, i8* %null_byte_loc +// %3 = icmp ne i8 %2, 0 +// %loc = getelementptr i8, i8* %expr_values, i32 0 +// %row_val = bitcast i8* %loc to i64* +// br i1 %is_null, label %null, label %not_null +// +// false_block: ; preds = %cmp9, %not_null2, %null1, +// %cmp, %not_null, %null +// ret i1 false +// +// null: ; preds = %entry +// br i1 %3, label %continue, label %false_block +// +// not_null: ; preds = %entry +// br i1 %3, label %false_block, label %cmp +// +// continue: ; preds = %cmp, %null +// %result4 = call { i64, i8* } @GetSlotRef.3(%"class.impala::ExprContext"* +// inttoptr (i64 139107328 to %"class.impala::ExprContext"*), +// %"class.impala::TupleRow"* %row) +// %4 = extractvalue { i64, i8* } %result4, 0 +// %is_null5 = trunc i64 %4 to i1 +// %null_byte_loc6 = getelementptr i8, i8* %expr_values_null, i32 1 +// %5 = load i8, i8* %null_byte_loc6 +// %6 = icmp ne i8 %5, 0 +// %loc7 = getelementptr i8, i8* %expr_values, i32 8 +// %row_val8 = bitcast i8* %loc7 to %"struct.impala::StringValue"* +// br i1 %is_null5, label %null1, label %not_null2 +// +// cmp: ; preds = %not_null +// %7 = load i64, i64* %row_val +// %val = extractvalue { i8, i64 } %result, 1 +// %cmp_raw = icmp eq i64 %val, %7 +// br i1 %cmp_raw, label %continue, label %false_block +// +// null1: ; preds = %continue +// br i1 %6, label %continue3, label %false_block +// +// not_null2: ; preds = %continue +// 
br i1 %6, label %false_block, label %cmp9 +// +// continue3: ; preds = %cmp9, %null1 +// ret i1 true +// +// cmp9: ; preds = %not_null2 +// store { i64, i8* } %result4, { i64, i8* }* %0 +// %8 = bitcast { i64, i8* }* %0 to %"struct.impala_udf::StringVal"* +// %cmp_raw10 = call i1 +// @_Z13StringValueEqRKN10impala_udf9StringValERKN6impala11StringValueE( +// %"struct.impala_udf::StringVal"* %8, %"struct.impala::StringValue"* %row_val8) +// br i1 %cmp_raw10, label %continue3, label %false_block +// } +Status NewPartitionedHashTableCtx::CodegenEquals(LlvmCodeGen* codegen, bool force_null_equality, + Function** fn) { + for (int i = 0; i < build_expr_ctxs_.size(); ++i) { + // Disable codegen for CHAR + if (build_expr_ctxs_[i]->root()->type().type == TYPE_CHAR) { + return Status("NewPartitionedHashTableCtx::CodegenEquals(): CHAR NYI"); + } + } + + // Get types to generate function prototype + Type* this_type = codegen->GetType(NewPartitionedHashTableCtx::LLVM_CLASS_NAME); + DCHECK(this_type != NULL); + PointerType* this_ptr_type = codegen->GetPtrType(this_type); + Type* tuple_row_type = codegen->GetType(TupleRow::LLVM_CLASS_NAME); + DCHECK(tuple_row_type != NULL); + PointerType* tuple_row_ptr_type = codegen->GetPtrType(tuple_row_type); + + LlvmCodeGen::FnPrototype prototype(codegen, "Equals", codegen->GetType(TYPE_BOOLEAN)); + prototype.AddArgument(LlvmCodeGen::NamedVariable("this_ptr", this_ptr_type)); + prototype.AddArgument(LlvmCodeGen::NamedVariable("row", tuple_row_ptr_type)); + prototype.AddArgument(LlvmCodeGen::NamedVariable("expr_values", codegen->ptr_type())); + prototype.AddArgument( + LlvmCodeGen::NamedVariable("expr_values_null", codegen->ptr_type())); + + LLVMContext& context = codegen->context(); + LlvmBuilder builder(context); + Value* args[4]; + *fn = prototype.GeneratePrototype(&builder, args); + Value* this_ptr = args[0]; + Value* row = args[1]; + Value* expr_values = args[2]; + Value* expr_values_null = args[3]; + + // ctx_vector = &build_expr_ctxs_[0] 
+ Value* ctx_vector = codegen->CodegenCallFunction(&builder, + IRFunction::HASH_TABLE_GET_BUILD_EXPR_CTX, this_ptr, "ctx_vector"); + + BasicBlock* false_block = BasicBlock::Create(context, "false_block", *fn); + for (int i = 0; i < build_expr_ctxs_.size(); ++i) { + BasicBlock* null_block = BasicBlock::Create(context, "null", *fn); + BasicBlock* not_null_block = BasicBlock::Create(context, "not_null", *fn); + BasicBlock* continue_block = BasicBlock::Create(context, "continue", *fn); + + // call GetValue on build_exprs[i] + Function* expr_fn; + Status status = build_expr_ctxs_[i]->root()->GetCodegendComputeFn(codegen, &expr_fn); + if (!status.ok()) { + (*fn)->eraseFromParent(); // deletes function + *fn = NULL; + return Status( + Substitute("Problem with NewPartitionedHashTableCtx::CodegenEquals: $0", status.GetDetail())); + } + if (build_expr_ctxs_.size() > LlvmCodeGen::CODEGEN_INLINE_EXPRS_THRESHOLD) { + // Avoid bloating function by inlining too many exprs into it. + codegen->SetNoInline(expr_fn); + } + + // Load ExprContext*: expr_ctx = ctx_vector[i]; + Value* expr_ctx = codegen->CodegenArrayAt(&builder, ctx_vector, i, "expr_ctx"); + + // Evaluate the expression. + CodegenAnyVal result = CodegenAnyVal::CreateCallWrapped(codegen, &builder, + build_expr_ctxs_[i]->root()->type(), expr_fn, {expr_ctx, row}, "result"); + Value* is_null = result.GetIsNull(); + + // Determine if row is null (i.e. expr_values_null[i] == true). In + // the case where the hash table does not store nulls, this is always false. 
+ Value* row_is_null = codegen->false_value(); + + // We consider null values equal if we are comparing build rows or if the join + // predicate is <=> + if (force_null_equality || finds_nulls_[i]) { + Value* llvm_null_byte_loc = builder.CreateInBoundsGEP( + NULL, expr_values_null, codegen->GetIntConstant(TYPE_INT, i), "null_byte_loc"); + Value* null_byte = builder.CreateLoad(llvm_null_byte_loc); + row_is_null = + builder.CreateICmpNE(null_byte, codegen->GetIntConstant(TYPE_TINYINT, 0)); + } + + // Get llvm value for row_val from 'expr_values' + int offset = expr_values_cache_.expr_values_offsets(i); + Value* loc = builder.CreateInBoundsGEP( + NULL, expr_values, codegen->GetIntConstant(TYPE_INT, offset), "loc"); + Value* row_val = builder.CreatePointerCast( + loc, codegen->GetPtrType(build_expr_ctxs_[i]->root()->type()), "row_val"); + + // Branch for GetValue() returning NULL + builder.CreateCondBr(is_null, null_block, not_null_block); + + // Null block + builder.SetInsertPoint(null_block); + builder.CreateCondBr(row_is_null, continue_block, false_block); + + // Not-null block + builder.SetInsertPoint(not_null_block); + if (stores_nulls_) { + BasicBlock* cmp_block = BasicBlock::Create(context, "cmp", *fn); + // First need to compare that row expr[i] is not null + builder.CreateCondBr(row_is_null, false_block, cmp_block); + builder.SetInsertPoint(cmp_block); + } + // Check result == row_val + Value* is_equal = result.EqToNativePtr(row_val); + builder.CreateCondBr(is_equal, continue_block, false_block); + + builder.SetInsertPoint(continue_block); + } + builder.CreateRet(codegen->true_value()); + + builder.SetInsertPoint(false_block); + builder.CreateRet(codegen->false_value()); + + // Avoid inlining into caller if it is large. 
+ if (build_expr_ctxs_.size() > LlvmCodeGen::CODEGEN_INLINE_EXPR_BATCH_THRESHOLD) { + codegen->SetNoInline(*fn); + } + *fn = codegen->FinalizeFunction(*fn); + if (*fn == NULL) { + return Status("Codegen'd NewPartitionedHashTableCtx::Equals() function failed verification, " + "see log"); + } + return Status::OK; +} + +Status NewPartitionedHashTableCtx::ReplaceHashTableConstants(LlvmCodeGen* codegen, + bool stores_duplicates, int num_build_tuples, Function* fn, + HashTableReplacedConstants* replacement_counts) { + + replacement_counts->stores_nulls = codegen->ReplaceCallSitesWithBoolConst( + fn, stores_nulls(), "stores_nulls"); + replacement_counts->finds_some_nulls = codegen->ReplaceCallSitesWithBoolConst( + fn, finds_some_nulls(), "finds_some_nulls"); + replacement_counts->stores_tuples = codegen->ReplaceCallSitesWithBoolConst( + fn, num_build_tuples == 1, "stores_tuples"); + replacement_counts->stores_duplicates = codegen->ReplaceCallSitesWithBoolConst( + fn, stores_duplicates, "stores_duplicates"); + replacement_counts->quadratic_probing = codegen->ReplaceCallSitesWithBoolConst( + fn, FLAGS_enable_quadratic_probing, "quadratic_probing"); + return Status::OK; +} + +#endif + diff --git a/be/src/exec/new_partitioned_hash_table.h b/be/src/exec/new_partitioned_hash_table.h new file mode 100644 index 0000000000..3b868803ba --- /dev/null +++ b/be/src/exec/new_partitioned_hash_table.h @@ -0,0 +1,1001 @@ +// Modifications copyright (C) 2017, Baidu.com, Inc. +// Copyright 2017 The Apache Software Foundation + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef BDG_PALO_BE_SRC_EXEC_NEW_PARTITIONED_HASH_TABLE_H +#define BDG_PALO_BE_SRC_EXEC_NEW_PARTITIONED_HASH_TABLE_H + +#include +#include +#include +#include +#include "codegen/palo_ir.h" +#include "common/logging.h" +#include "common/compiler_util.h" +#include "runtime/buffered_tuple_stream3.h" +#include "runtime/buffered_tuple_stream3.inline.h" +#include "runtime/bufferpool/buffer_pool.h" +#include "runtime/bufferpool/suballocator.h" +#include "runtime/tuple_row.h" +#include "util/bitmap.h" +#include "util/hash_util.hpp" + +namespace llvm { + class Function; +} + +namespace palo { + +class Expr; +class ExprContext; +class LlvmCodeGen; +class MemTracker; +class RowDescriptor; +class RuntimeState; +class Tuple; +class TupleRow; +class NewPartitionedHashTable; + +/// Linear or quadratic probing hash table implementation tailored to the usage pattern +/// for partitioned hash aggregation and hash joins. The hash table stores TupleRows and +/// allows for different exprs for insertions and finds. This is the pattern we use for +/// joins and aggregation where the input/build tuple row descriptor is different from the +/// find/probe descriptor. The implementation is designed to allow codegen for some paths. +// +/// In addition to the hash table there is also an accompanying hash table context that is +/// used for insertions and probes. For example, the hash table context stores evaluated +/// expr results for the current row being processed when possible into a contiguous +/// memory buffer. This allows for efficient hash computation. 
+// +/// The hash table does not support removes. The hash table is not thread safe. +/// The table is optimized for the partition hash aggregation and hash joins and is not +/// intended to be a generic hash table implementation. The API loosely mimics the +/// std::hashset API. +// +/// The data (rows) are stored in a BufferedTupleStream3. The basic data structure of this +/// hash table is a vector of buckets. The buckets (indexed by the mod of the hash) +/// contain a pointer to either the slot in the tuple-stream or in case of duplicate +/// values, to the head of a linked list of nodes that in turn contain a pointer to +/// tuple-stream slots. When inserting an entry we start at the bucket at position +/// (hash % size) and search for either a bucket with the same hash or for an empty +/// bucket. If a bucket with the same hash is found, we then compare for row equality and +/// either insert a duplicate node if the equality is true, or continue the search if the +/// row equality is false. Similarly, when probing we start from the bucket at position +/// (hash % size) and search for an entry with the same hash or for an empty bucket. +/// In the former case, we then check for row equality and continue the search if the row +/// equality is false. In the latter case, the probe is not successful. When growing the +/// hash table, the number of buckets is doubled. We trigger a resize when the fill +/// factor is approx 75%. Due to the doubling nature of the buckets, we require that the +/// number of buckets is a power of 2. This allows us to perform a modulo of the hash +/// using a bitmask. +/// +/// We choose to use linear or quadratic probing because they exhibit good (predictable) +/// cache behavior. +/// +/// The first NUM_SMALL_BLOCKS of nodes_ are made of blocks less than the IO size (of 8MB) +/// to reduce the memory footprint of small queries. +/// +/// TODO: Compare linear and quadratic probing and remove the loser. 
+/// TODO: We currently use 32-bit hashes. There is room in the bucket structure for at +/// least 48-bits. We should exploit this space. +/// TODO: Consider capping the probes with a threshold value. If an insert reaches +/// that threshold it is inserted to another linked list of overflow entries. +/// TODO: Smarter resizes, and perhaps avoid using powers of 2 as the hash table size. +/// TODO: this is not a fancy hash table in terms of memory access patterns +/// (cuckoo-hashing or something that spills to disk). We will likely want to invest +/// more time into this. +/// TODO: hash-join and aggregation have very different access patterns. Joins insert all +/// the rows and then calls scan to find them. Aggregation interleaves FindProbeRow() and +/// Inserts(). We may want to optimize joins more heavily for Inserts() (in particular +/// growing). +/// TODO: Batched interface for inserts and finds. +/// TODO: Do we need to check mem limit exceeded so often. Check once per batch? +/// TODO: as an optimization, compute variable-length data size for the agg node. + +/// Control block for a hash table. This class contains the logic as well as the variables +/// needed by a thread to operate on a hash table. +class NewPartitionedHashTableCtx { + public: + + /// Create a hash table context with the specified parameters, invoke Init() to + /// initialize the new hash table context and return it in 'ht_ctx'. Expression + /// evaluators for the build and probe expressions will also be allocated. + /// Please see the comments of HashTableCtx constructor and Init() for details + /// of other parameters. 
+ static Status Create(ObjectPool* pool, RuntimeState* state, + const std::vector& build_exprs, + const std::vector& probe_exprs, bool stores_nulls, + const std::vector& finds_nulls, int32_t initial_seed, int max_levels, + int num_build_tuples, MemPool* mem_pool, + MemTracker* tracker, const RowDescriptor& row_desc, + const RowDescriptor& row_desc_probe, + boost::scoped_ptr* ht_ctx); + + /// Initialize the build and probe expression evaluators. + Status Open(RuntimeState* state); + + /// Call to cleanup any resources allocated by the expression evaluators. + void Close(RuntimeState* state); + + /// Free local allocations made by build and probe expression evaluators respectively. + void FreeBuildLocalAllocations(); + void FreeProbeLocalAllocations(); + + /// Free local allocations of both build and probe expression evaluators. + void FreeLocalAllocations(); + + void set_level(int level); + + int ALWAYS_INLINE level() const { return level_; } + + uint32_t ALWAYS_INLINE seed(int level) { return seeds_.at(level); } + + TupleRow* ALWAYS_INLINE scratch_row() const { return scratch_row_; } + + /// Returns the results of the expression at 'expr_idx' evaluated at the current row. + /// This value is invalid if the expr evaluated to NULL. + /// TODO: this is an awkward abstraction but aggregation node can take advantage of + /// it and save some expr evaluation calls. + void* ALWAYS_INLINE ExprValue(int expr_idx) const { + return expr_values_cache_.ExprValuePtr( + expr_values_cache_.cur_expr_values(), expr_idx); + } + + /// Returns if the expression at 'expr_idx' is evaluated to NULL for the current row. 
+ bool ALWAYS_INLINE ExprValueNull(int expr_idx) const { + return static_cast(*(expr_values_cache_.cur_expr_values_null() + expr_idx)); + } + + /// Evaluate and hash the build/probe row, saving the evaluation to the current row of + /// the ExprValuesCache in this hash table context: the results are saved in + /// 'cur_expr_values_', the nullness of expressions values in 'cur_expr_values_null_', + /// and the hashed expression values in 'cur_expr_values_hash_'. Returns false if this + /// row should be rejected (doesn't need to be processed further) because it contains + /// NULL. These need to be inlined in the IR module so we can find and replace the + /// calls to EvalBuildRow()/EvalProbeRow(). + bool IR_ALWAYS_INLINE EvalAndHashBuild(TupleRow* row); + bool IR_ALWAYS_INLINE EvalAndHashProbe(TupleRow* row); + + /// Codegen for evaluating a tuple row. Codegen'd function matches the signature + /// for EvalBuildRow and EvalTupleRow. + /// If build_row is true, the codegen uses the build_exprs, otherwise the probe_exprs. + Status CodegenEvalRow(LlvmCodeGen* codegen, bool build_row, llvm::Function** fn); + + /// Codegen for evaluating a TupleRow and comparing equality. Function signature + /// matches HashTable::Equals(). 'force_null_equality' is true if the generated + /// equality function should treat all NULLs as equal. See the template parameter + /// to HashTable::Equals(). + Status CodegenEquals(LlvmCodeGen* codegen, bool force_null_equality, + llvm::Function** fn); + + /// Codegen for hashing expr values. Function prototype matches HashRow identically. + /// Unlike HashRow(), the returned function only uses a single hash function, rather + /// than switching based on level_. If 'use_murmur' is true, murmur hash is used, + /// otherwise CRC is used if the hardware supports it (see hash-util.h). + Status CodegenHashRow(LlvmCodeGen* codegen, bool use_murmur, llvm::Function** fn); + + /// Struct that returns the number of constants replaced by ReplaceConstants(). 
+ struct HashTableReplacedConstants { + int stores_nulls; + int finds_some_nulls; + int stores_tuples; + int stores_duplicates; + int quadratic_probing; + }; + + /// Replace hash table parameters with constants in 'fn'. Updates 'replacement_counts' + /// with the number of replacements made. 'num_build_tuples' and 'stores_duplicates' + /// correspond to HashTable parameters with the same name. + Status ReplaceHashTableConstants(LlvmCodeGen* codegen, bool stores_duplicates, + int num_build_tuples, llvm::Function* fn, + HashTableReplacedConstants* replacement_counts); + + static const char* LLVM_CLASS_NAME; + + /// To enable prefetching, the hash table building and probing are pipelined by the + /// exec nodes. A set of rows in a row batch will be evaluated and hashed first and + /// the corresponding hash table buckets are prefetched before they are probed against + /// the hash table. ExprValuesCache is a container for caching the results of + /// expressions evaluations for the rows in a prefetch set to avoid re-evaluating the + /// rows again during probing. Expressions evaluation can be very expensive. + /// + /// The expression evaluation results are cached in the following data structures: + /// + /// - 'expr_values_array_' is an array caching the results of the rows + /// evaluated against either the build or probe expressions. 'cur_expr_values_' + /// is a pointer into this array. + /// - 'expr_values_null_array_' is an array caching the nullness of each evaluated + /// expression in each row. 'cur_expr_values_null_' is a pointer into this array. + /// - 'expr_values_hash_array_' is an array of cached hash values of the rows. + /// 'cur_expr_values_hash_' is a pointer into this array. + /// - 'null_bitmap_' is a bitmap which indicates rows evaluated to NULL. + /// + /// ExprValuesCache provides an iterator like interface for performing a write pass + /// followed by a read pass. 
We refrain from providing an interface for random accesses + /// as there isn't a use case for it now and we want to avoid expensive multiplication + /// as the buffer size of each row is not necessarily power of two: + /// - Reset(), ResetForRead(): reset the iterators before writing / reading cached values. + /// - NextRow(): moves the iterators to point to the next row of cached values. + /// - AtEnd(): returns true if all cached rows have been read. Valid in read mode only. + /// + /// Various metadata information such as layout of results buffer is also stored in + /// this class. Note that the result buffer doesn't store variable length data. It only + /// contains pointers to the variable length data (e.g. if an expression value is a + /// StringValue). + /// + class ExprValuesCache { + public: + ExprValuesCache(); + + /// Allocates memory and initializes various data structures. Return error status + /// if memory allocation leads to the memory limits of the exec node to be exceeded. + /// 'tracker' is the memory tracker of the exec node which owns this NewPartitionedHashTableCtx. + Status Init(RuntimeState* state, MemTracker* tracker, + const std::vector& build_exprs); + + /// Frees up various resources and updates memory tracker with proper accounting. + /// 'tracker' should be the same memory tracker which was passed in for Init(). + void Close(MemTracker* tracker); + + /// Resets the cache states (iterators, end pointers etc) before writing. + void Reset() noexcept; + + /// Resets the iterators to the start before reading. Will record the current position + /// of the iterators in end pointer before resetting so AtEnd() can determine if all + /// cached values have been read. + void ResetForRead(); + + /// Advances the iterators to the next row by moving to the next entries in the + /// arrays of cached values. + void ALWAYS_INLINE NextRow(); + + /// Compute the total memory usage of this ExprValuesCache. 
+ static int MemUsage(int capacity, int results_buffer_size, int num_build_exprs); + + /// Returns the maximum number rows of expression values states which can be cached. + int ALWAYS_INLINE capacity() const { return capacity_; } + + /// Returns the total size in bytes of a row of evaluated expressions' values. + int ALWAYS_INLINE expr_values_bytes_per_row() const { + return expr_values_bytes_per_row_; + } + + /// Returns the offset into the result buffer of the first variable length + /// data results. + int ALWAYS_INLINE var_result_offset() const { return var_result_offset_; } + + /// Returns true if the current read pass is complete, meaning all cached values + /// have been read. + bool ALWAYS_INLINE AtEnd() const { + return cur_expr_values_hash_ == cur_expr_values_hash_end_; + } + + /// Returns true if the current row is null but nulls are not considered in the current + /// phase (build or probe). + bool ALWAYS_INLINE IsRowNull() const { return null_bitmap_.Get(CurIdx()); } + + /// Record in a bitmap that the current row is null but nulls are not considered in + /// the current phase (build or probe). + void ALWAYS_INLINE SetRowNull() { null_bitmap_.Set(CurIdx(), true); } + + /// Returns the hash values of the current row. + uint32_t ALWAYS_INLINE CurExprValuesHash() const { return *cur_expr_values_hash_; } + + /// Sets the hash values for the current row. + void ALWAYS_INLINE SetCurExprValuesHash(uint32_t hash) { + *cur_expr_values_hash_ = hash; + } + + /// Returns a pointer to the expression value at 'expr_idx' in 'expr_values'. + uint8_t* ExprValuePtr(uint8_t* expr_values, int expr_idx) const; + const uint8_t* ExprValuePtr(const uint8_t* expr_values, int expr_idx) const; + + /// Returns the current row's expression buffer. The expression values in the buffer + /// are accessed using ExprValuePtr(). + uint8_t* ALWAYS_INLINE cur_expr_values() const { return cur_expr_values_; } + + /// Returns null indicator bytes for the current row, one per expression. 
Non-zero + /// bytes mean NULL, zero bytes mean non-NULL. Indexed by the expression index. + /// These are uint8_t instead of bool to simplify codegen with IRBuilder. + /// TODO: is there actually a valid reason why this is necessary for codegen? + uint8_t* ALWAYS_INLINE cur_expr_values_null() const { return cur_expr_values_null_; } + + /// Returns the offset into the results buffer of the expression value at 'expr_idx'. + int ALWAYS_INLINE expr_values_offsets(int expr_idx) const { + return expr_values_offsets_[expr_idx]; + } + + private: + friend class NewPartitionedHashTableCtx; + + /// Resets the iterators to the beginning of the cache values' arrays. + void ResetIterators(); + + /// Returns the offset in number of rows into the cached values' buffer. + int ALWAYS_INLINE CurIdx() const { + return cur_expr_values_hash_ - expr_values_hash_array_.get(); + } + + /// Max amount of memory in bytes for caching evaluated expression values. + static const int MAX_EXPR_VALUES_ARRAY_SIZE = 256 << 10; + + /// Maximum number of rows of expressions evaluation states which this + /// ExprValuesCache can cache. + int capacity_; + + /// Byte size of a row of evaluated expression values. Never changes once set, + /// can be used for constant substitution during codegen. + int expr_values_bytes_per_row_; + + /// Number of build/probe expressions. + int num_exprs_; + + /// Pointer into 'expr_values_array_' for the current row's expression values. + uint8_t* cur_expr_values_; + + /// Pointer into 'expr_values_null_array_' for the current row's nullness of each + /// expression value. + uint8_t* cur_expr_values_null_; + + /// Pointer into 'expr_hash_value_array_' for the hash value of current row's + /// expression values. + uint32_t* cur_expr_values_hash_; + + /// Pointer to the buffer one beyond the end of the last entry of cached expressions' + /// hash values. 
+ uint32_t* cur_expr_values_hash_end_; + + /// Array for caching up to 'capacity_' number of rows worth of evaluated expression + /// values. Each row consumes 'expr_values_bytes_per_row_' number of bytes. + boost::scoped_array expr_values_array_; + + /// Array for caching up to 'capacity_' number of rows worth of null booleans. + /// Each row contains 'num_exprs_' booleans to indicate nullness of expression values. + /// Used when the hash table supports NULL. Use 'uint8_t' to guarantee each entry is 1 + /// byte as sizeof(bool) is implementation dependent. The IR depends on this + /// assumption. + boost::scoped_array expr_values_null_array_; + + /// Array for caching up to 'capacity_' number of rows worth of hashed values. + boost::scoped_array expr_values_hash_array_; + + /// One bit for each row. A bit is set if that row is not hashed as it's evaluated + /// to NULL but the hash table doesn't support NULL. Such rows may still be included + /// in outputs for certain join types (e.g. left anti joins). + Bitmap null_bitmap_; + + /// Maps from expression index to the byte offset into a row of expression values. + /// One entry per build/probe expression. + std::vector expr_values_offsets_; + + /// Byte offset into 'cur_expr_values_' that begins the variable length results for + /// a row. If -1, there are no variable length slots. Never changes once set, can be + /// constant substituted with codegen. + int var_result_offset_; + }; + + ExprValuesCache* ALWAYS_INLINE expr_values_cache() { return &expr_values_cache_; } + + private: + friend class NewPartitionedHashTable; + friend class HashTableTest_HashEmpty_Test; + + /// Construct a hash table context. + /// - build_exprs are the exprs that should be used to evaluate rows during Insert(). 
+ /// - probe_exprs are used during FindProbeRow()
+ /// - stores_nulls: if false, TupleRows with nulls are ignored during Insert
+ /// - finds_nulls: if finds_nulls[i] is false, FindProbeRow() returns End() for
+ /// TupleRows with nulls in position i even if stores_nulls is true.
+ /// - initial_seed: initial seed value to use when computing hashes for rows with
+ /// level 0. Other levels have their seeds derived from this seed.
+ /// - max_levels: the max levels we will hash with.
+ /// - mem_pool: the MemPool which the expression evaluators allocate from. Owned by the
+ /// exec node which owns this hash table context. Memory usage of the expression
+ /// value cache is charged against its MemTracker.
+ ///
+ /// TODO: stores_nulls is too coarse: for a hash table in which some columns are joined
+ /// with '<=>' and others with '=', stores_nulls could distinguish between columns
+ /// in which nulls are stored and columns in which they are not, which could save
+ /// space by not storing some rows we know will never match.
+ NewPartitionedHashTableCtx(const std::vector& build_exprs,
+ const std::vector& probe_exprs, bool stores_nulls,
+ const std::vector& finds_nulls, int32_t initial_seed,
+ int max_levels, MemPool* mem_pool);
+
+ /// Allocate various buffers for storing expression evaluation results, hash values,
+ /// null bits etc. Also allocate evaluators for the build and probe expressions and
+ /// store them in 'pool'. Returns error if allocation causes query memory limit to
+ /// be exceeded or the evaluators fail to initialize. 'num_build_tuples' is the number
+ /// of tuples of a row in the build side, used for computing the size of a scratch row.
+ Status Init(ObjectPool* pool, RuntimeState* state, int num_build_tuples,
+ MemTracker* tracker, const RowDescriptor& row_desc, const RowDescriptor& row_desc_probe);
+
+ /// Compute the hash of the values in 'expr_values' with nullness 'expr_values_null'.
+ /// This will be replaced by codegen.
We don't want this inlined for replacing + /// with codegen'd functions so the function name does not change. + uint32_t IR_NO_INLINE HashRow( + const uint8_t* expr_values, const uint8_t* expr_values_null) const noexcept; + + /// Wrapper function for calling correct HashUtil function in non-codegen'd case. + uint32_t Hash(const void* input, int len, uint32_t hash) const; + + /// Evaluate 'row' over build exprs, storing values into 'expr_values' and nullness into + /// 'expr_values_null'. This will be replaced by codegen. We do not want this function + /// inlined when cross compiled because we need to be able to differentiate between + /// EvalBuildRow and EvalProbeRow by name and the build/probe exprs are baked into the + /// codegen'd function. + bool IR_NO_INLINE EvalBuildRow( + TupleRow* row, uint8_t* expr_values, uint8_t* expr_values_null) noexcept { + return EvalRow(row, build_expr_evals_, expr_values, expr_values_null); + } + + /// Evaluate 'row' over probe exprs, storing the values into 'expr_values' and nullness + /// into 'expr_values_null'. This will be replaced by codegen. + bool IR_NO_INLINE EvalProbeRow( + TupleRow* row, uint8_t* expr_values, uint8_t* expr_values_null) noexcept { + return EvalRow(row, probe_expr_evals_, expr_values, expr_values_null); + } + + /// Compute the hash of the values in 'expr_values' with nullness 'expr_values_null' + /// for a row with variable length fields (e.g. strings). + uint32_t HashVariableLenRow( + const uint8_t* expr_values, const uint8_t* expr_values_null) const; + + /// Evaluate the exprs over row, storing the values into 'expr_values' and nullness into + /// 'expr_values_null'. Returns whether any expr evaluated to NULL. This will be + /// replaced by codegen. 
+ bool EvalRow(TupleRow* row, const std::vector& ctxs, + uint8_t* expr_values, uint8_t* expr_values_null) noexcept; + + /// Returns true if the values of build_exprs evaluated over 'build_row' equal the + /// values in 'expr_values' with nullness 'expr_values_null'. FORCE_NULL_EQUALITY is + /// true if all nulls should be treated as equal, regardless of the values of + /// 'finds_nulls_'. This will be replaced by codegen. + template + bool IR_NO_INLINE Equals(TupleRow* build_row, const uint8_t* expr_values, + const uint8_t* expr_values_null) const noexcept; + + /// Helper function that calls Equals() with the current row. Always inlined so that + /// it does not appear in cross-compiled IR. + template + bool ALWAYS_INLINE Equals(TupleRow* build_row) const { + return Equals(build_row, expr_values_cache_.cur_expr_values(), + expr_values_cache_.cur_expr_values_null()); + } + + /// Cross-compiled function to access member variables used in CodegenHashRow(). + uint32_t IR_ALWAYS_INLINE GetHashSeed() const; + + /// Functions to be replaced by codegen to specialize the hash table. + bool IR_NO_INLINE stores_nulls() const { return stores_nulls_; } + bool IR_NO_INLINE finds_some_nulls() const { return finds_some_nulls_; } + + /// Cross-compiled function to access the build/probe expression context. + /// Called by generated LLVM IR functions such as Equals() and EvalRow(). + ExprContext* const* IR_ALWAYS_INLINE build_expr_evals() const; + ExprContext* const* IR_ALWAYS_INLINE probe_expr_evals() const; + + const std::vector& build_exprs_; + std::vector build_expr_evals_; + + const std::vector& probe_exprs_; + std::vector probe_expr_evals_; + + /// Constants on how the hash table should behave. Joins and aggs have slightly + /// different behavior. + const bool stores_nulls_; + const std::vector finds_nulls_; + + /// finds_some_nulls_ is just the logical OR of finds_nulls_. + const bool finds_some_nulls_; + + /// The current level this context is working on. 
Each level needs to use a + /// different seed. + int level_; + + /// The seeds to use for hashing. Indexed by the level. + std::vector seeds_; + + /// The ExprValuesCache for caching expression evaluation results, null bytes and hash + /// values for rows. Used to store results of batch evaluations of rows. + ExprValuesCache expr_values_cache_; + + /// Scratch buffer to generate rows on the fly. + TupleRow* scratch_row_; + + /// MemPool for 'build_expr_evals_' and 'probe_expr_evals_' to allocate expr-managed + /// memory from. Not owned. + MemPool* mem_pool_; +}; + +/// The hash table consists of a contiguous array of buckets that contain a pointer to the +/// data, the hash value and three flags: whether this bucket is filled, whether this +/// entry has been matched (used in right and full joins) and whether this entry has +/// duplicates. If there are duplicates, then the data is pointing to the head of a +/// linked list of duplicate nodes that point to the actual data. Note that the duplicate +/// nodes do not contain the hash value, because all the linked nodes have the same hash +/// value, the one in the bucket. The data is either a tuple stream index or a Tuple*. +/// This array of buckets is sparse, we are shooting for up to 3/4 fill factor (75%). The +/// data allocated by the hash table comes from the BufferPool. +class NewPartitionedHashTable { + private: + + /// Rows are represented as pointers into the BufferedTupleStream data with one + /// of two formats, depending on the number of tuples in the row. + union HtData { + // For rows with multiple tuples per row, a pointer to the flattened TupleRow. + BufferedTupleStream3::FlatRowPtr flat_row; + Tuple* tuple; + }; + + /// Linked list of entries used for duplicates. + struct DuplicateNode { + /// Used for full outer and right {outer, anti, semi} joins. Indicates whether the + /// row in the DuplicateNode has been matched. 
+ /// From an abstraction point of view, this is an awkward place to store this
+ /// information.
+ /// TODO: Fold this flag in the next pointer below.
+ bool matched;
+
+ /// Chain to next duplicate node, NULL when end of list.
+ DuplicateNode* next;
+ HtData htdata;
+ };
+
+ struct Bucket {
+ /// Whether this bucket contains a valid entry, or it is empty.
+ bool filled;
+
+ /// Used for full outer and right {outer, anti, semi} joins. Indicates whether the
+ /// row in the bucket has been matched.
+ /// From an abstraction point of view, this is an awkward place to store this
+ /// information but it is efficient. This space is otherwise unused.
+ bool matched;
+
+ /// Used in case of duplicates. If true, then the bucketData union should be used as
+ /// 'duplicates'.
+ bool hasDuplicates;
+
+ /// Cache of the hash for data.
+ /// TODO: Do we even have to cache the hash value?
+ uint32_t hash;
+
+ /// Either the data for this bucket or the linked list of duplicates.
+ union {
+ HtData htdata;
+ DuplicateNode* duplicates;
+ } bucketData;
+ };
+
+ public:
+ class Iterator;
+
+ /// Returns a newly allocated HashTable. The probing algorithm is set by the
+ /// FLAG_enable_quadratic_probing.
+ /// - allocator: allocator to allocate bucket directory and data pages from.
+ /// - stores_duplicates: true if rows with duplicate keys may be inserted into the
+ /// hash table.
+ /// - num_build_tuples: number of Tuples in the build tuple row.
+ /// - tuple_stream: the tuple stream which contains the tuple rows indexed by the
+ /// hash table. Can be NULL if the rows contain only a single tuple, in which
+ /// case the 'tuple_stream' is unused.
+ /// - max_num_buckets: the maximum number of buckets that can be stored. If we
+ /// try to grow the number of buckets to a larger number, the inserts will fail.
+ /// -1 if it is unlimited.
+ /// - initial_num_buckets: number of buckets that the hash table should be initialized
+ /// with.
+ static NewPartitionedHashTable* Create(Suballocator* allocator, bool stores_duplicates, + int num_build_tuples, BufferedTupleStream3* tuple_stream, int64_t max_num_buckets, + int64_t initial_num_buckets); + + /// Allocates the initial bucket structure. Returns a non-OK status if an error is + /// encountered. If an OK status is returned , 'got_memory' is set to indicate whether + /// enough memory for the initial buckets was allocated from the Suballocator. + Status Init(bool* got_memory); + + /// Call to cleanup any resources. Must be called once. + void Close(); + + /// Inserts the row to the hash table. The caller is responsible for ensuring that the + /// table has free buckets. Returns true if the insertion was successful. Always + /// returns true if the table has free buckets and the key is not a duplicate. If the + /// key was a duplicate and memory could not be allocated for the new duplicate node, + /// returns false. If an error is encountered while creating a duplicate node, returns + /// false and sets 'status' to the error. + /// + /// 'flat_row' is a pointer to the flattened row in 'tuple_stream_' If the row contains + /// only one tuple, a pointer to that tuple is stored. Otherwise the 'flat_row' pointer + /// is stored. The 'row' is not copied by the hash table and the caller must guarantee + /// it stays in memory. This will not grow the hash table. + bool IR_ALWAYS_INLINE Insert(NewPartitionedHashTableCtx* ht_ctx, + BufferedTupleStream3::FlatRowPtr flat_row, TupleRow* row, + Status* status); + + /// Prefetch the hash table bucket which the given hash value 'hash' maps to. + template + void IR_ALWAYS_INLINE PrefetchBucket(uint32_t hash); + + /// Returns an iterator to the bucket that matches the probe expression results that + /// are cached at the current position of the ExprValuesCache in 'ht_ctx'. Assumes that + /// the ExprValuesCache was filled using EvalAndHashProbe(). Returns HashTable::End() + /// if no match is found. 
The iterator can be iterated until HashTable::End() to find + /// all the matching rows. Advancing the returned iterator will go to the next matching + /// row. The matching rows do not need to be evaluated since all the nodes of a bucket + /// are duplicates. One scan can be in progress for each 'ht_ctx'. Used in the probe + /// phase of hash joins. + Iterator IR_ALWAYS_INLINE FindProbeRow(NewPartitionedHashTableCtx* ht_ctx); + + /// If a match is found in the table, return an iterator as in FindProbeRow(). If a + /// match was not present, return an iterator pointing to the empty bucket where the key + /// should be inserted. Returns End() if the table is full. The caller can set the data + /// in the bucket using a Set*() method on the iterator. + Iterator IR_ALWAYS_INLINE FindBuildRowBucket(NewPartitionedHashTableCtx* ht_ctx, bool* found); + + /// Returns number of elements inserted in the hash table + int64_t size() const { + return num_filled_buckets_ - num_buckets_with_duplicates_ + num_duplicate_nodes_; + } + + /// Returns the number of empty buckets. + int64_t EmptyBuckets() const { return num_buckets_ - num_filled_buckets_; } + + /// Returns the number of buckets + int64_t num_buckets() const { return num_buckets_; } + + /// Returns the load factor (the number of non-empty buckets) + double load_factor() const { + return static_cast(num_filled_buckets_) / num_buckets_; + } + + /// Return an estimate of the number of bytes needed to build the hash table + /// structure for 'num_rows'. To do that, it estimates the number of buckets, + /// rounded up to a power of two, and also assumes that there are no duplicates. + static int64_t EstimateNumBuckets(int64_t num_rows) { + /// Assume max 66% fill factor and no duplicates. 
+ return BitUtil::next_power_of_two(3 * num_rows / 2);
+ }
+ static int64_t EstimateSize(int64_t num_rows) {
+ int64_t num_buckets = EstimateNumBuckets(num_rows);
+ return num_buckets * sizeof(Bucket);
+ }
+
+ /// Return the size of a hash table bucket in bytes.
+ static int64_t BucketSize() { return sizeof(Bucket); }
+
+ /// Returns the memory occupied by the hash table, takes into account the number of
+ /// duplicates.
+ int64_t CurrentMemSize() const;
+
+ /// Returns the number of inserts that can be performed before resizing the table.
+ int64_t NumInsertsBeforeResize() const;
+
+ /// Calculates the fill factor if 'buckets_to_fill' additional buckets were to be
+ /// filled and resizes the hash table so that the projected fill factor is below the
+ /// max fill factor.
+ /// If 'got_memory' is true, then it is guaranteed at least 'rows_to_add' rows can be
+ /// inserted without need to resize. If there is not enough memory available to
+ /// resize the hash table, Status::OK() is returned and 'got_memory' is false. If
+ /// another error occurs, an error status may be returned.
+ Status CheckAndResize(uint64_t buckets_to_fill, const NewPartitionedHashTableCtx* ht_ctx,
+ bool* got_memory);
+
+ /// Returns the number of bytes allocated to the hash table from the block manager.
+ int64_t ByteSize() const {
+ return num_buckets_ * sizeof(Bucket) + total_data_page_size_;
+ }
+
+ /// Returns an iterator at the beginning of the hash table. Advancing this iterator
+ /// will traverse all elements.
+ Iterator Begin(const NewPartitionedHashTableCtx* ht_ctx);
+
+ /// Return an iterator pointing to the first element (Bucket or DuplicateNode, if the
+ /// bucket has duplicates) in the hash table that does not have its matched flag set.
+ /// Used in right joins and full-outer joins.
+ Iterator FirstUnmatched(NewPartitionedHashTableCtx* ctx);
+
+ /// Return true if there was at least one match.
+ bool HasMatches() const { return has_matches_; }
+
+ /// Return end marker.
+ Iterator End() { return Iterator(); } + + /// Dump out the entire hash table to string. If 'skip_empty', empty buckets are + /// skipped. If 'show_match', it also prints the matched flag of each node. If + /// 'build_desc' is non-null, the build rows will be printed. Otherwise, only the + /// the addresses of the build rows will be printed. + std::string DebugString(bool skip_empty, bool show_match, + const RowDescriptor* build_desc); + + /// Print the content of a bucket or node. + void DebugStringTuple(std::stringstream& ss, HtData& htdata, const RowDescriptor* desc); + + /// Update and print some statistics that can be used for performance debugging. + std::string PrintStats() const; + + /// Number of hash collisions so far in the lifetime of this object + int64_t NumHashCollisions() const { return num_hash_collisions_; } + + /// stl-like iterator interface. + class Iterator { + private: + /// Bucket index value when probe is not successful. + static const int64_t BUCKET_NOT_FOUND = -1; + + public: + IR_ALWAYS_INLINE Iterator() : + table_(NULL), + scratch_row_(NULL), + bucket_idx_(BUCKET_NOT_FOUND), + node_(NULL) { } + + /// Iterates to the next element. It should be called only if !AtEnd(). + void IR_ALWAYS_INLINE Next(); + + /// Iterates to the next duplicate node. If the bucket does not have duplicates or + /// when it reaches the last duplicate node, then it moves the Iterator to AtEnd(). + /// Used when we want to iterate over all the duplicate nodes bypassing the Next() + /// interface (e.g. in semi/outer joins without other_join_conjuncts, in order to + /// iterate over all nodes of an unmatched bucket). + void IR_ALWAYS_INLINE NextDuplicate(); + + /// Iterates to the next element that does not have its matched flag set. Used in + /// right-outer and full-outer joins. + void IR_ALWAYS_INLINE NextUnmatched(); + + /// Return the current row or tuple. Callers must check the iterator is not AtEnd() + /// before calling them. 
The returned row is owned by the iterator and valid until + /// the next call to GetRow(). It is safe to advance the iterator. + TupleRow* IR_ALWAYS_INLINE GetRow() const; + Tuple* IR_ALWAYS_INLINE GetTuple() const; + + /// Set the current tuple for an empty bucket. Designed to be used with the iterator + /// returned from FindBuildRowBucket() in the case when the value is not found. It is + /// not valid to call this function if the bucket already has an entry. + void SetTuple(Tuple* tuple, uint32_t hash); + + /// Sets as matched the Bucket or DuplicateNode currently pointed by the iterator, + /// depending on whether the bucket has duplicates or not. The iterator cannot be + /// AtEnd(). + void SetMatched(); + + /// Returns the 'matched' flag of the current Bucket or DuplicateNode, depending on + /// whether the bucket has duplicates or not. It should be called only if !AtEnd(). + bool IsMatched() const; + + /// Resets everything but the pointer to the hash table. + void SetAtEnd(); + + /// Returns true if this iterator is at the end, i.e. GetRow() cannot be called. + bool ALWAYS_INLINE AtEnd() const { return bucket_idx_ == BUCKET_NOT_FOUND; } + + /// Prefetch the hash table bucket which the iterator is pointing to now. + template + void IR_ALWAYS_INLINE PrefetchBucket(); + + private: + friend class NewPartitionedHashTable; + + ALWAYS_INLINE + Iterator(NewPartitionedHashTable* table, TupleRow* row, int bucket_idx, DuplicateNode* node) + : table_(table), + scratch_row_(row), + bucket_idx_(bucket_idx), + node_(node) { + } + + NewPartitionedHashTable* table_; + + /// Scratch buffer to hold generated rows. Not owned. + TupleRow* scratch_row_; + + /// Current bucket idx. + int64_t bucket_idx_; + + /// Pointer to the current duplicate node. + DuplicateNode* node_; + }; + + private: + friend class Iterator; + friend class HashTableTest; + + /// Hash table constructor. Private because Create() should be used, instead + /// of calling this constructor directly. 
+ /// - quadratic_probing: set to true when the probing algorithm is quadratic, as
+ /// opposed to linear.
+ NewPartitionedHashTable(bool quadratic_probing, Suballocator* allocator, bool stores_duplicates,
+ int num_build_tuples, BufferedTupleStream3* tuple_stream, int64_t max_num_buckets,
+ int64_t initial_num_buckets);
+
+ /// Performs the probing operation according to the probing algorithm (linear or
+ /// quadratic). Returns one of the following:
+ /// (a) the index of the bucket that contains the entry that matches with the last row
+ /// evaluated in 'ht_ctx'. If 'ht_ctx' is NULL then it does not check for row
+ /// equality and returns the index of the first empty bucket.
+ /// (b) the index of the first empty bucket according to the probing algorithm (linear
+ /// or quadratic), if the entry is not in the hash table or 'ht_ctx' is NULL.
+ /// (c) Iterator::BUCKET_NOT_FOUND if the probe was not successful, i.e. the maximum
+ /// distance was traveled without finding either an empty or a matching bucket.
+ /// Using the returned index value, the caller can create an iterator that can be
+ /// iterated until End() to find all the matching rows.
+ ///
+ /// EvalAndHashBuild() or EvalAndHashProbe() must have been called before calling
+ /// this function. The values of the expression values cache in 'ht_ctx' will be
+ /// used to probe the hash table.
+ ///
+ /// 'FORCE_NULL_EQUALITY' is true if NULLs should always be considered equal when
+ /// comparing two rows.
+ ///
+ /// 'hash' is the hash computed by EvalAndHashBuild() or EvalAndHashProbe().
+ /// 'found' indicates that a bucket that contains an equal row is found.
+ ///
+ /// There are wrappers of this function that perform the Find and Insert logic.
+ template
+ int64_t IR_ALWAYS_INLINE Probe(Bucket* buckets, int64_t num_buckets,
+ NewPartitionedHashTableCtx* ht_ctx, uint32_t hash, bool* found);
+
+ /// Performs the insert logic.
Returns the HtData* of the bucket or duplicate node
+ /// where the data should be inserted. Returns NULL if the insert was not successful
+ /// and either sets 'status' to OK if it failed because not enough reservation was
+ /// available or the error if an error was encountered.
+ HtData* IR_ALWAYS_INLINE InsertInternal(NewPartitionedHashTableCtx* ht_ctx, Status* status);
+
+ /// Updates 'bucket_idx' to the index of the next non-empty bucket. If the bucket has
+ /// duplicates, 'node' will be pointing to the head of the linked list of duplicates.
+ /// Otherwise, 'node' should not be used. If there are no more buckets, sets
+ /// 'bucket_idx' to BUCKET_NOT_FOUND.
+ void NextFilledBucket(int64_t* bucket_idx, DuplicateNode** node);
+
+ /// Resize the hash table to 'num_buckets'. 'got_memory' is false on OOM.
+ Status ResizeBuckets(int64_t num_buckets, const NewPartitionedHashTableCtx* ht_ctx, bool* got_memory);
+
+ /// Appends the DuplicateNode pointed by next_node_ to 'bucket' and moves the next_node_
+ /// pointer to the next DuplicateNode in the page, updating the remaining node counter.
+ DuplicateNode* IR_ALWAYS_INLINE AppendNextNode(Bucket* bucket);
+
+ /// Creates a new DuplicateNode for an entry and chains it to the bucket with index
+ /// 'bucket_idx'. The duplicate nodes of a bucket are chained as a linked list.
+ /// This places the new duplicate node at the beginning of the list. If this is the
+ /// first duplicate entry inserted in this bucket, then the entry already contained by
+ /// the bucket is converted to a DuplicateNode. That is, the contents of 'data' of the
+ /// bucket are copied to a DuplicateNode and 'data' is updated to point to a
+ /// DuplicateNode.
+ /// Returns NULL and sets 'status' to OK if the node array could not grow, i.e. there
+ /// was not enough memory to allocate a new DuplicateNode. Returns NULL and sets
+ /// 'status' to an error if another error was encountered.
+ DuplicateNode* IR_ALWAYS_INLINE InsertDuplicateNode(int64_t bucket_idx, Status* status); + + /// Resets the contents of the empty bucket with index 'bucket_idx', in preparation for + /// an insert. Sets all the fields of the bucket other than 'data'. + void IR_ALWAYS_INLINE PrepareBucketForInsert(int64_t bucket_idx, uint32_t hash); + + /// Return the TupleRow pointed by 'htdata'. + TupleRow* GetRow(HtData& htdata, TupleRow* row) const; + + /// Returns the TupleRow of the pointed 'bucket'. In case of duplicates, it + /// returns the content of the first chained duplicate node of the bucket. + TupleRow* GetRow(Bucket* bucket, TupleRow* row) const; + + /// Grow the node array. Returns true and sets 'status' to OK on success. Returns false + /// and set 'status' to OK if we can't get sufficient reservation to allocate the next + /// data page. Returns false and sets 'status' if another error is encountered. + bool GrowNodeArray(Status* status); + + /// Functions to be replaced by codegen to specialize the hash table. + bool IR_NO_INLINE stores_tuples() const { return stores_tuples_; } + bool IR_NO_INLINE stores_duplicates() const { return stores_duplicates_; } + bool IR_NO_INLINE quadratic_probing() const { return quadratic_probing_; } + + /// Load factor that will trigger growing the hash table on insert. This is + /// defined as the number of non-empty buckets / total_buckets + static constexpr double MAX_FILL_FACTOR = 0.75; + + /// The size in bytes of each page of duplicate nodes. Should be large enough to fit + /// enough DuplicateNodes to amortise the overhead of allocating each page and low + /// enough to not waste excessive memory to internal fragmentation. + static constexpr int64_t DATA_PAGE_SIZE = 64L * 1024; + + RuntimeState* state_; + + /// Suballocator to allocate data pages and hash table buckets with. + Suballocator* allocator_; + + /// Stream contains the rows referenced by the hash table. 
Can be NULL if the
+ /// row only contains a single tuple, in which case the TupleRow indirection
+ /// is removed by the hash table.
+ BufferedTupleStream3* tuple_stream_;
+
+ /// Constants on how the hash table should behave.
+
+ /// True if the HtData uses the Tuple* representation, or false if it uses FlatRowPtr.
+ const bool stores_tuples_;
+
+ /// True if duplicates may be inserted into hash table.
+ const bool stores_duplicates_;
+
+ /// Quadratic probing enabled (as opposed to linear).
+ const bool quadratic_probing_;
+
+ /// Data pages for all nodes. Allocated from suballocator to reduce memory
+ /// consumption of small tables.
+ std::vector> data_pages_;
+
+ /// Byte size of all buffers in data_pages_.
+ int64_t total_data_page_size_;
+
+ /// Next duplicate node to insert. Valid when node_remaining_current_page_ > 0.
+ DuplicateNode* next_node_;
+
+ /// Number of nodes left in the current page.
+ int node_remaining_current_page_;
+
+ /// Number of duplicate nodes.
+ int64_t num_duplicate_nodes_;
+
+ const int64_t max_num_buckets_;
+
+ /// Allocation containing all buckets.
+ std::unique_ptr bucket_allocation_;
+
+ /// Pointer to the 'buckets_' array from 'bucket_allocation_'.
+ Bucket* buckets_;
+
+ /// Total number of buckets (filled and empty).
+ int64_t num_buckets_;
+
+ /// Number of non-empty buckets. Used to determine when to resize.
+ int64_t num_filled_buckets_;
+
+ /// Number of (non-empty) buckets with duplicates. These buckets do not point to slots
+ /// in the tuple stream, but rather to a linked list of Nodes.
+ int64_t num_buckets_with_duplicates_;
+
+ /// Number of build tuples, used for constructing temp row* for probes.
+ const int num_build_tuples_;
+
+ /// Flag used to check that we don't lose stored matches when spilling hash tables
+ /// (IMPALA-1488).
+ bool has_matches_;
+
+ /// The stats below can be used for debugging perf.
+ /// TODO: Should we make these statistics atomic?
+ /// Number of FindProbeRow(), Insert(), or FindBuildRowBucket() calls that probe the + /// hash table. + int64_t num_probes_; + + /// Number of probes that failed and had to fall back to linear probing without cap. + int64_t num_failed_probes_; + + /// Total distance traveled for each probe. That is the sum of the diff between the end + /// position of a probe (find/insert) and its start position + /// (hash & (num_buckets_ - 1)). + int64_t travel_length_; + + /// The number of cases where we had to compare buckets with the same hash value, but + /// the row equality failed. + int64_t num_hash_collisions_; + + /// How many times this table has resized so far. + int64_t num_resizes_; +}; + +} + +#endif + diff --git a/be/src/exec/new_partitioned_hash_table.inline.h b/be/src/exec/new_partitioned_hash_table.inline.h new file mode 100644 index 0000000000..748f7be5a7 --- /dev/null +++ b/be/src/exec/new_partitioned_hash_table.inline.h @@ -0,0 +1,411 @@ +// Modifications copyright (C) 2017, Baidu.com, Inc. +// Copyright 2017 The Apache Software Foundation + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef BDG_PALO_BE_SRC_EXEC_NEW_PARTITIONED_HASH_TABLE_INLINE_H +#define BDG_PALO_BE_SRC_EXEC_NEW_PARTITIONED_HASH_TABLE_INLINE_H + +#include "exec/new_partitioned_hash_table.h" + +#include "exprs/expr.h" +#include "exprs/expr_context.h" + +namespace palo { + +inline bool NewPartitionedHashTableCtx::EvalAndHashBuild(TupleRow* row) { + uint8_t* expr_values = expr_values_cache_.cur_expr_values(); + uint8_t* expr_values_null = expr_values_cache_.cur_expr_values_null(); + bool has_null = EvalBuildRow(row, expr_values, expr_values_null); + if (!stores_nulls() && has_null) return false; + expr_values_cache_.SetCurExprValuesHash(HashRow(expr_values, expr_values_null)); + return true; +} + +inline bool NewPartitionedHashTableCtx::EvalAndHashProbe(TupleRow* row) { + uint8_t* expr_values = expr_values_cache_.cur_expr_values(); + uint8_t* expr_values_null = expr_values_cache_.cur_expr_values_null(); + bool has_null = EvalProbeRow(row, expr_values, expr_values_null); + if (has_null && !(stores_nulls() && finds_some_nulls())) return false; + expr_values_cache_.SetCurExprValuesHash(HashRow(expr_values, expr_values_null)); + return true; +} + +inline void NewPartitionedHashTableCtx::ExprValuesCache::NextRow() { + cur_expr_values_ += expr_values_bytes_per_row_; + cur_expr_values_null_ += num_exprs_; + ++cur_expr_values_hash_; + DCHECK_LE(cur_expr_values_hash_ - expr_values_hash_array_.get(), capacity_); +} + +template +inline int64_t NewPartitionedHashTable::Probe(Bucket* buckets, int64_t num_buckets, + NewPartitionedHashTableCtx* ht_ctx, uint32_t hash, bool* found) { + DCHECK(buckets != NULL); + DCHECK_GT(num_buckets, 0); + *found = false; + int64_t bucket_idx = hash & (num_buckets - 1); + + // In case of linear probing it counts the total number of steps for statistics and + // for knowing when to exit the loop (e.g. by capping the total travel length). In case + // of quadratic probing it is also used for calculating the length of the next jump. 
+ int64_t step = 0; + do { + Bucket* bucket = &buckets[bucket_idx]; + if (LIKELY(!bucket->filled)) return bucket_idx; + if (hash == bucket->hash) { + if (ht_ctx != NULL && + ht_ctx->Equals(GetRow(bucket, ht_ctx->scratch_row_))) { + *found = true; + return bucket_idx; + } + // Row equality failed, or not performed. This is a hash collision. Continue + // searching. + ++num_hash_collisions_; + } + // Move to the next bucket. + ++step; + ++travel_length_; + if (quadratic_probing()) { + // The i-th probe location is idx = (hash + (step * (step + 1)) / 2) mod num_buckets. + // This gives num_buckets unique idxs (between 0 and N-1) when num_buckets is a power + // of 2. + bucket_idx = (bucket_idx + step) & (num_buckets - 1); + } else { + bucket_idx = (bucket_idx + 1) & (num_buckets - 1); + } + } while (LIKELY(step < num_buckets)); + DCHECK_EQ(num_filled_buckets_, num_buckets) << "Probing of a non-full table " + << "failed: " << quadratic_probing() << " " << hash; + return Iterator::BUCKET_NOT_FOUND; +} + +inline NewPartitionedHashTable::HtData* NewPartitionedHashTable::InsertInternal( + NewPartitionedHashTableCtx* ht_ctx, Status* status) { + ++num_probes_; + bool found = false; + uint32_t hash = ht_ctx->expr_values_cache()->CurExprValuesHash(); + int64_t bucket_idx = Probe(buckets_, num_buckets_, ht_ctx, hash, &found); + DCHECK_NE(bucket_idx, Iterator::BUCKET_NOT_FOUND); + if (found) { + // We need to insert a duplicate node, note that this may fail to allocate memory. 
+ DuplicateNode* new_node = InsertDuplicateNode(bucket_idx, status); + if (UNLIKELY(new_node == NULL)) return NULL; + return &new_node->htdata; + } else { + PrepareBucketForInsert(bucket_idx, hash); + return &buckets_[bucket_idx].bucketData.htdata; + } +} + +inline bool NewPartitionedHashTable::Insert(NewPartitionedHashTableCtx* ht_ctx, + BufferedTupleStream3::FlatRowPtr flat_row, TupleRow* row, Status* status) { + HtData* htdata = InsertInternal(ht_ctx, status); + // If successful insert, update the contents of the newly inserted entry with 'idx'. + if (LIKELY(htdata != NULL)) { + if (stores_tuples()) { + htdata->tuple = row->get_tuple(0); + } else { + htdata->flat_row = flat_row; + } + return true; + } + return false; +} + +template +inline void NewPartitionedHashTable::PrefetchBucket(uint32_t hash) { + int64_t bucket_idx = hash & (num_buckets_ - 1); + // Two optional arguments: + // 'rw': 1 means the memory access is write + // 'locality': 0-3. 0 means no temporal locality. 3 means high temporal locality. + // On x86, they map to instructions prefetchnta and prefetch{2-0} respectively. + // TODO: Reconsider the locality level with smaller prefetch batch size. + __builtin_prefetch(&buckets_[bucket_idx], READ ? 0 : 1, 1); +} + +inline NewPartitionedHashTable::Iterator NewPartitionedHashTable::FindProbeRow( + NewPartitionedHashTableCtx* ht_ctx) { + ++num_probes_; + bool found = false; + uint32_t hash = ht_ctx->expr_values_cache()->CurExprValuesHash(); + int64_t bucket_idx = Probe(buckets_, num_buckets_, ht_ctx, hash, &found); + if (found) { + return Iterator(this, ht_ctx->scratch_row(), bucket_idx, + stores_duplicates() ? buckets_[bucket_idx].bucketData.duplicates : NULL); + } + return End(); +} + +// TODO: support lazy evaluation like HashTable::Insert(). 
+inline NewPartitionedHashTable::Iterator NewPartitionedHashTable::FindBuildRowBucket( + NewPartitionedHashTableCtx* ht_ctx, bool* found) { + ++num_probes_; + uint32_t hash = ht_ctx->expr_values_cache()->CurExprValuesHash(); + int64_t bucket_idx = Probe(buckets_, num_buckets_, ht_ctx, hash, found); + DuplicateNode* duplicates = NULL; + if (stores_duplicates() && LIKELY(bucket_idx != Iterator::BUCKET_NOT_FOUND)) { + duplicates = buckets_[bucket_idx].bucketData.duplicates; + } + return Iterator(this, ht_ctx->scratch_row(), bucket_idx, duplicates); +} + +inline NewPartitionedHashTable::Iterator NewPartitionedHashTable::Begin( + const NewPartitionedHashTableCtx* ctx) { + int64_t bucket_idx = Iterator::BUCKET_NOT_FOUND; + DuplicateNode* node = NULL; + NextFilledBucket(&bucket_idx, &node); + return Iterator(this, ctx->scratch_row(), bucket_idx, node); +} + +inline NewPartitionedHashTable::Iterator NewPartitionedHashTable::FirstUnmatched( + NewPartitionedHashTableCtx* ctx) { + int64_t bucket_idx = Iterator::BUCKET_NOT_FOUND; + DuplicateNode* node = NULL; + NextFilledBucket(&bucket_idx, &node); + Iterator it(this, ctx->scratch_row(), bucket_idx, node); + // Check whether the bucket, or its first duplicate node, is matched. If it is not + // matched, then return. Otherwise, move to the first unmatched entry (node or bucket). + Bucket* bucket = &buckets_[bucket_idx]; + bool has_duplicates = stores_duplicates() && bucket->hasDuplicates; + if ((!has_duplicates && bucket->matched) || (has_duplicates && node->matched)) { + it.NextUnmatched(); + } + return it; +} + +inline void NewPartitionedHashTable::NextFilledBucket(int64_t* bucket_idx, DuplicateNode** node) { + ++*bucket_idx; + for (; *bucket_idx < num_buckets_; ++*bucket_idx) { + if (buckets_[*bucket_idx].filled) { + *node = stores_duplicates() ? buckets_[*bucket_idx].bucketData.duplicates : NULL; + return; + } + } + // Reached the end of the hash table. 
+ *bucket_idx = Iterator::BUCKET_NOT_FOUND; + *node = NULL; +} + +inline void NewPartitionedHashTable::PrepareBucketForInsert(int64_t bucket_idx, uint32_t hash) { + DCHECK_GE(bucket_idx, 0); + DCHECK_LT(bucket_idx, num_buckets_); + Bucket* bucket = &buckets_[bucket_idx]; + DCHECK(!bucket->filled); + ++num_filled_buckets_; + bucket->filled = true; + bucket->matched = false; + bucket->hasDuplicates = false; + bucket->hash = hash; +} + +inline NewPartitionedHashTable::DuplicateNode* NewPartitionedHashTable::AppendNextNode(Bucket* bucket) { + DCHECK_GT(node_remaining_current_page_, 0); + bucket->bucketData.duplicates = next_node_; + ++num_duplicate_nodes_; + --node_remaining_current_page_; + return next_node_++; +} + +inline NewPartitionedHashTable::DuplicateNode* NewPartitionedHashTable::InsertDuplicateNode( + int64_t bucket_idx, Status* status) { + DCHECK_GE(bucket_idx, 0); + DCHECK_LT(bucket_idx, num_buckets_); + Bucket* bucket = &buckets_[bucket_idx]; + DCHECK(bucket->filled); + DCHECK(stores_duplicates()); + // Allocate one duplicate node for the new data and one for the preexisting data, + // if needed. + while (node_remaining_current_page_ < 1 + !bucket->hasDuplicates) { + if (UNLIKELY(!GrowNodeArray(status))) return NULL; + } + if (!bucket->hasDuplicates) { + // This is the first duplicate in this bucket. It means that we need to convert + // the current entry in the bucket to a node and link it from the bucket. + next_node_->htdata.flat_row = bucket->bucketData.htdata.flat_row; + DCHECK(!bucket->matched); + next_node_->matched = false; + next_node_->next = NULL; + AppendNextNode(bucket); + bucket->hasDuplicates = true; + ++num_buckets_with_duplicates_; + } + // Link a new node. 
+ next_node_->next = bucket->bucketData.duplicates; + next_node_->matched = false; + return AppendNextNode(bucket); +} + +inline TupleRow* IR_ALWAYS_INLINE NewPartitionedHashTable::GetRow(HtData& htdata, TupleRow* row) const { + if (stores_tuples()) { + return reinterpret_cast(&htdata.tuple); + } else { + // TODO: GetTupleRow() has interpreted code that iterates over the row's descriptor. + tuple_stream_->GetTupleRow(htdata.flat_row, row); + return row; + } +} + +inline TupleRow* IR_ALWAYS_INLINE NewPartitionedHashTable::GetRow(Bucket* bucket, TupleRow* row) const { + DCHECK(bucket != NULL); + if (UNLIKELY(stores_duplicates() && bucket->hasDuplicates)) { + DuplicateNode* duplicate = bucket->bucketData.duplicates; + DCHECK(duplicate != NULL); + return GetRow(duplicate->htdata, row); + } else { + return GetRow(bucket->bucketData.htdata, row); + } +} + +inline TupleRow* IR_ALWAYS_INLINE NewPartitionedHashTable::Iterator::GetRow() const { + DCHECK(!AtEnd()); + DCHECK(table_ != NULL); + DCHECK(scratch_row_ != NULL); + Bucket* bucket = &table_->buckets_[bucket_idx_]; + if (UNLIKELY(table_->stores_duplicates() && bucket->hasDuplicates)) { + DCHECK(node_ != NULL); + return table_->GetRow(node_->htdata, scratch_row_); + } else { + return table_->GetRow(bucket->bucketData.htdata, scratch_row_); + } +} + +inline Tuple* IR_ALWAYS_INLINE NewPartitionedHashTable::Iterator::GetTuple() const { + DCHECK(!AtEnd()); + DCHECK(table_->stores_tuples()); + Bucket* bucket = &table_->buckets_[bucket_idx_]; + // TODO: To avoid the hasDuplicates check, store the HtData* in the Iterator. 
+ if (UNLIKELY(table_->stores_duplicates() && bucket->hasDuplicates)) { + DCHECK(node_ != NULL); + return node_->htdata.tuple; + } else { + return bucket->bucketData.htdata.tuple; + } +} + +inline void NewPartitionedHashTable::Iterator::SetTuple(Tuple* tuple, uint32_t hash) { + DCHECK(!AtEnd()); + DCHECK(table_->stores_tuples()); + table_->PrepareBucketForInsert(bucket_idx_, hash); + table_->buckets_[bucket_idx_].bucketData.htdata.tuple = tuple; +} + +inline void NewPartitionedHashTable::Iterator::SetMatched() { + DCHECK(!AtEnd()); + Bucket* bucket = &table_->buckets_[bucket_idx_]; + if (table_->stores_duplicates() && bucket->hasDuplicates) { + node_->matched = true; + } else { + bucket->matched = true; + } + // Used for disabling spilling of hash tables in right and full-outer joins with + // matches. See IMPALA-1488. + table_->has_matches_ = true; +} + +inline bool NewPartitionedHashTable::Iterator::IsMatched() const { + DCHECK(!AtEnd()); + Bucket* bucket = &table_->buckets_[bucket_idx_]; + if (table_->stores_duplicates() && bucket->hasDuplicates) { + return node_->matched; + } + return bucket->matched; +} + +inline void NewPartitionedHashTable::Iterator::SetAtEnd() { + bucket_idx_ = BUCKET_NOT_FOUND; + node_ = NULL; +} + +template +inline void NewPartitionedHashTable::Iterator::PrefetchBucket() { + if (LIKELY(!AtEnd())) { + // HashTable::PrefetchBucket() takes a hash value to index into the hash bucket + // array. Passing 'bucket_idx_' here is sufficient. 
+ DCHECK_EQ((bucket_idx_ & ~(table_->num_buckets_ - 1)), 0); + table_->PrefetchBucket(bucket_idx_); + } +} + +inline void NewPartitionedHashTable::Iterator::Next() { + DCHECK(!AtEnd()); + if (table_->stores_duplicates() && table_->buckets_[bucket_idx_].hasDuplicates && + node_->next != NULL) { + node_ = node_->next; + } else { + table_->NextFilledBucket(&bucket_idx_, &node_); + } +} + +inline void NewPartitionedHashTable::Iterator::NextDuplicate() { + DCHECK(!AtEnd()); + if (table_->stores_duplicates() && table_->buckets_[bucket_idx_].hasDuplicates && + node_->next != NULL) { + node_ = node_->next; + } else { + bucket_idx_ = BUCKET_NOT_FOUND; + node_ = NULL; + } +} + +inline void NewPartitionedHashTable::Iterator::NextUnmatched() { + DCHECK(!AtEnd()); + Bucket* bucket = &table_->buckets_[bucket_idx_]; + // Check if there is any remaining unmatched duplicate node in the current bucket. + if (table_->stores_duplicates() && bucket->hasDuplicates) { + while (node_->next != NULL) { + node_ = node_->next; + if (!node_->matched) return; + } + } + // Move to the next filled bucket and return if this bucket is not matched or + // iterate to the first not matched duplicate node. 
+ table_->NextFilledBucket(&bucket_idx_, &node_); + while (bucket_idx_ != Iterator::BUCKET_NOT_FOUND) { + bucket = &table_->buckets_[bucket_idx_]; + if (!table_->stores_duplicates() || !bucket->hasDuplicates) { + if (!bucket->matched) return; + } else { + while (node_->matched && node_->next != NULL) { + node_ = node_->next; + } + if (!node_->matched) return; + } + table_->NextFilledBucket(&bucket_idx_, &node_); + } +} + +inline void NewPartitionedHashTableCtx::set_level(int level) { + DCHECK_GE(level, 0); + DCHECK_LT(level, seeds_.size()); + level_ = level; +} + +inline int64_t NewPartitionedHashTable::CurrentMemSize() const { + return num_buckets_ * sizeof(Bucket) + num_duplicate_nodes_ * sizeof(DuplicateNode); +} + +inline int64_t NewPartitionedHashTable::NumInsertsBeforeResize() const { + return std::max( + 0, static_cast(num_buckets_ * MAX_FILL_FACTOR) - num_filled_buckets_); +} + +} + +#endif + diff --git a/be/src/exec/new_partitioned_hash_table_ir.cc b/be/src/exec/new_partitioned_hash_table_ir.cc new file mode 100644 index 0000000000..5662d78b2b --- /dev/null +++ b/be/src/exec/new_partitioned_hash_table_ir.cc @@ -0,0 +1,38 @@ +// Modifications copyright (C) 2017, Baidu.com, Inc. +// Copyright 2017 The Apache Software Foundation + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifdef IR_COMPILE + +#include "exec/new_partitioned_hash_table.h" + +using namespace palo; + +uint32_t NewPartitionedHashTableCtx::GetHashSeed() const { return seeds_[level_]; } + +ExprContext* const* NewPartitionedHashTableCtx::build_expr_evals() const { + return build_expr_evals_.data(); +} + +ExprContext* const* NewPartitionedHashTableCtx::probe_expr_evals() const { + return probe_expr_evals_.data(); +} + +#endif + diff --git a/be/src/exec/olap_common.h b/be/src/exec/olap_common.h index 4212daf4a9..9a4091d9d0 100644 --- a/be/src/exec/olap_common.h +++ b/be/src/exec/olap_common.h @@ -130,7 +130,7 @@ public: TCondition high; if (_type_max != _high_value || FILTER_LESS_OR_EQUAL != _high_op) { high.__set_column_name(_column_name); - high.__set_condition_op((FILTER_LESS_OR_EQUAL ? "<=" : "<<")); + high.__set_condition_op((_high_op == FILTER_LESS_OR_EQUAL ? "<=" : "<<")); high.condition_values.push_back(cast_to_string(_high_value)); } diff --git a/be/src/exec/olap_rewrite_node.cpp b/be/src/exec/olap_rewrite_node.cpp index 64e49dc5ac..424feb25b5 100644 --- a/be/src/exec/olap_rewrite_node.cpp +++ b/be/src/exec/olap_rewrite_node.cpp @@ -36,8 +36,8 @@ OlapRewriteNode::OlapRewriteNode(ObjectPool* pool, _child_eos(false) { } -Status OlapRewriteNode::init(const TPlanNode& tnode) { - RETURN_IF_ERROR(ExecNode::init(tnode)); +Status OlapRewriteNode::init(const TPlanNode& tnode, RuntimeState* state) { + RETURN_IF_ERROR(ExecNode::init(tnode, state)); DCHECK(tnode.__isset.olap_rewrite_node); // create columns RETURN_IF_ERROR(Expr::create_expr_trees( diff --git a/be/src/exec/olap_rewrite_node.h b/be/src/exec/olap_rewrite_node.h index 69bb244421..8e0722e372 100644 --- a/be/src/exec/olap_rewrite_node.h +++ b/be/src/exec/olap_rewrite_node.h @@ -31,7 +31,7 @@ class OlapRewriteNode : public ExecNode { public: OlapRewriteNode(ObjectPool* pool, const TPlanNode& tnode, const 
DescriptorTbl& descs); - virtual Status init(const TPlanNode& tnode); + virtual Status init(const TPlanNode& tnode, RuntimeState* state = nullptr); virtual ~OlapRewriteNode() { } virtual Status prepare(RuntimeState* state); virtual Status open(RuntimeState* state); diff --git a/be/src/exec/olap_scan_node.cpp b/be/src/exec/olap_scan_node.cpp index 4def6f8864..c05d746da8 100644 --- a/be/src/exec/olap_scan_node.cpp +++ b/be/src/exec/olap_scan_node.cpp @@ -49,7 +49,6 @@ namespace palo { OlapScanNode::OlapScanNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs): ScanNode(pool, tnode, descs), - _thrift_plan_node(new TPlanNode(tnode)), _tuple_id(tnode.olap_scan_node.tuple_id), _olap_scan_node(tnode.olap_scan_node), _tuple_desc(NULL), @@ -60,7 +59,6 @@ OlapScanNode::OlapScanNode(ObjectPool* pool, const TPlanNode& tnode, const Descr _start(false), _scanner_done(false), _transfer_done(false), - _use_pushdown_conjuncts(true), _wait_duration(0, 0, 1, 0), _status(Status::OK), _resource_info(nullptr), @@ -72,37 +70,81 @@ OlapScanNode::OlapScanNode(ObjectPool* pool, const TPlanNode& tnode, const Descr OlapScanNode::~OlapScanNode() { } -Status OlapScanNode::init(const TPlanNode& tnode) { - RETURN_IF_ERROR(ExecNode::init(tnode)); +Status OlapScanNode::init(const TPlanNode& tnode, RuntimeState* state) { + RETURN_IF_ERROR(ExecNode::init(tnode, state)); _direct_conjunct_size = _conjunct_ctxs.size(); if (tnode.olap_scan_node.__isset.sort_column) { - _sort_column = tnode.olap_scan_node.sort_column; _is_result_order = true; - LOG(INFO) << "SortColumn: " << _sort_column; } else { _is_result_order = false; } + + // Before, we support scan data ordered, but is not used in production + // Now, we drop this functional + DCHECK(!_is_result_order) << "ordered result don't support any more"; + return Status::OK; } +void OlapScanNode::_init_counter(RuntimeState* state) { +#if 0 + ADD_TIMER(profile, "GetTabletTime"); + ADD_TIMER(profile, "InitReaderTime"); + 
ADD_TIMER(profile, "ShowHintsTime"); + ADD_TIMER(profile, "BlockLoadTime"); + ADD_TIMER(profile, "IndexLoadTime"); + ADD_TIMER(profile, "VectorPredicateEvalTime"); + ADD_TIMER(profile, "ScannerTimer"); + ADD_TIMER(profile, "IOTimer"); + ADD_TIMER(profile, "DecompressorTimer"); + ADD_TIMER(profile, "RLETimer"); + + ADD_COUNTER(profile, "RawRowsRead", TUnit::UNIT); + ADD_COUNTER(profile, "IndexStreamCacheMiss", TUnit::UNIT); + ADD_COUNTER(profile, "IndexStreamCacheHit", TUnit::UNIT); + ADD_COUNTER(profile, "BlockLoadCount", TUnit::UNIT); +#endif + ADD_TIMER(_runtime_profile, "ShowHintsTime"); + + _read_compressed_counter = + ADD_COUNTER(_runtime_profile, "CompressedBytesRead", TUnit::BYTES); + _read_uncompressed_counter = + ADD_COUNTER(_runtime_profile, "UncompressedBytesRead", TUnit::BYTES); + _block_load_timer = ADD_TIMER(_runtime_profile, "BlockLoadTime"); + _block_load_counter = + ADD_COUNTER(_runtime_profile, "BlocksLoad", TUnit::UNIT); + _block_fetch_timer = + ADD_TIMER(_runtime_profile, "BlockFetchTime"); + _raw_rows_counter = + ADD_COUNTER(_runtime_profile, "RawRowsRead", TUnit::UNIT); + + _rows_vec_cond_counter = + ADD_COUNTER(_runtime_profile, "RowsVectorPredFiltered", TUnit::UNIT); + _vec_cond_timer = + ADD_TIMER(_runtime_profile, "VectorPredEvalTime"); + + _stats_filtered_counter = + ADD_COUNTER(_runtime_profile, "RowsStatsFiltered", TUnit::UNIT); + _del_filtered_counter = + ADD_COUNTER(_runtime_profile, "RowsDelFiltered", TUnit::UNIT); + + _io_timer = ADD_TIMER(_runtime_profile, "IOTimer"); + _decompressor_timer = ADD_TIMER(_runtime_profile, "DecompressorTimer"); + _index_load_timer = ADD_TIMER(_runtime_profile, "IndexLoadTime"); + + _scan_timer = ADD_TIMER(_runtime_profile, "ScanTime"); +} + Status OlapScanNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ScanNode::prepare(state)); // create scanner profile - _scanner_profile = state->obj_pool()->add( - new RuntimeProfile(state->obj_pool(), "OlapScanner")); - 
_runtime_profile->add_child(_scanner_profile, true, nullptr); - OLAPReader::init_profile(_scanner_profile); // create timer - _olap_thread_scan_timer = ADD_TIMER(_runtime_profile, "OlapScanTime"); - _eval_timer = ADD_TIMER(_runtime_profile, "EvalTime"); - _merge_timer = ADD_TIMER(_runtime_profile, "SortMergeTime"); - _pushdown_return_counter = - ADD_COUNTER(runtime_profile(), "PushDownFilterReturnCount ", TUnit::UNIT); - _direct_return_counter = - ADD_COUNTER(runtime_profile(), "DirectFilterReturnCount ", TUnit::UNIT); _tablet_counter = ADD_COUNTER(runtime_profile(), "TabletCount ", TUnit::UNIT); + _rows_pushed_cond_filtered_counter = + ADD_COUNTER(_runtime_profile, "RowsPushedCondFiltered", TUnit::UNIT); + _init_counter(state); _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); if (_tuple_desc == NULL) { @@ -169,7 +211,9 @@ Status OlapScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eo boost::unique_lock l(_row_batches_lock); _transfer_done = true; boost::lock_guard guard(_status_mutex); - _status = Status::CANCELLED; + if (LIKELY(_status.ok())) { + _status = Status::CANCELLED; + } return _status; } @@ -291,23 +335,6 @@ Status OlapScanNode::close(RuntimeState* state) { _scan_row_batches.clear(); - if (_is_result_order) { - for (int i = 0; i < _merge_rowbatches.size(); ++i) { - for (std::list::iterator it = _merge_rowbatches[i].begin(); - it != _merge_rowbatches[i].end(); ++it) { - delete *it; - } - } - - _merge_rowbatches.clear(); - - for (int i = 0; i < _backup_rowbatches.size(); ++i) { - if (_backup_rowbatches[i] != NULL) { - delete _backup_rowbatches[i]; - } - } - } - // OlapScanNode terminate by exception // so that initiative close the Scanner for (auto scanner : _all_olap_scanners) { @@ -315,11 +342,11 @@ Status OlapScanNode::close(RuntimeState* state) { } VLOG(1) << "OlapScanNode::close()"; - return ExecNode::close(state); + return ScanNode::close(state); } Status OlapScanNode::set_scan_ranges(const std::vector& scan_ranges) 
{ - BOOST_FOREACH(const TScanRangeParams & scan_range, scan_ranges) { + for (auto& scan_range : scan_ranges) { DCHECK(scan_range.scan_range.__isset.palo_scan_range); boost::shared_ptr palo_scan_range( new PaloScanRange(scan_range.scan_range.palo_scan_range)); @@ -515,15 +542,10 @@ Status OlapScanNode::build_scan_key() { DCHECK(column_types.size() == column_names.size()); // 1. construct scan key except last olap engine short key - int order_column_index = -1; int column_index = 0; _scan_keys.set_is_convertible(limit() == -1); for (; column_index < column_names.size() && !_scan_keys.has_range_value(); ++column_index) { - if (_is_result_order && _sort_column == column_names[column_index]) { - order_column_index = column_index; - } - std::map::iterator column_range_iter = _column_value_ranges.find(column_names[column_index]); @@ -535,11 +557,6 @@ Status OlapScanNode::build_scan_key() { RETURN_IF_ERROR(boost::apply_visitor(visitor, column_range_iter->second)); } - // 3. check order column - if (_is_result_order && order_column_index == -1) { - return Status("OlapScanNode unsupport order by " + _sort_column); - } - _scan_keys.debug(); return Status::OK; @@ -591,26 +608,18 @@ Status OlapScanNode::start_scan_thread(RuntimeState* state) { key_ranges.push_back(_query_key_ranges[i]); ++i; - if (!_is_result_order) { - for (int j = 1; - j < key_range_num_per_scanner - && i < key_range_size - && _query_scan_ranges[i] == _query_scan_ranges[i - 1] - && _query_key_ranges[i].end_include == _query_key_ranges[i - 1].end_include; - j++, i++) { - key_ranges.push_back(_query_key_ranges[i]); - } + for (int j = 1; + j < key_range_num_per_scanner + && i < key_range_size + && _query_scan_ranges[i] == _query_scan_ranges[i - 1] + && _query_key_ranges[i].end_include == _query_key_ranges[i - 1].end_include; + j++, i++) { + key_ranges.push_back(_query_key_ranges[i]); } OlapScanner* scanner = new OlapScanner( - state, - scan_range, - key_ranges, - _olap_filter, - *_tuple_desc, - _scanner_profile, 
- _is_null_vector); - scanner->set_aggregation(_olap_scan_node.is_preaggregation); + state, this, _olap_scan_node.is_preaggregation, + scan_range.get(), key_ranges); _scanner_pool->add(scanner); _olap_scanners.push_back(scanner); @@ -623,69 +632,9 @@ Status OlapScanNode::start_scan_thread(RuntimeState* state) { _progress = ProgressUpdater(ss.str(), _olap_scanners.size(), 1); _progress.set_logging_level(1); - if (_is_result_order) { - _transfer_thread.add_thread( - new boost::thread( - &OlapScanNode::merge_transfer_thread, this, state)); - } else { - _transfer_thread.add_thread( - new boost::thread( - &OlapScanNode::transfer_thread, this, state)); - } - - return Status::OK; -} - -Status OlapScanNode::create_conjunct_ctxs( - RuntimeState* state, - std::vector* row_ctxs, - std::vector* vec_ctxs, - bool disable_codegen) { - _direct_row_conjunct_size = -1; - _direct_vec_conjunct_size = -1; -#if 0 - for (int i = 0; i < _conjunct_ctxs.size(); ++i) { - if (/* _conjunct_ctxs[i]->is_vectorized() */false) { - vec_expr->emplace_back(); - RETURN_IF_ERROR(Expr::copy_expr(_runtime_state->obj_pool(), - _conjunct_ctxs[i], - &vec_expr->back())); - RETURN_IF_ERROR(Expr::prepare(vec_expr->back(), - _runtime_state, - row_desc(), - disable_codegen)); - if (i >= _direct_conjunct_size) { - vec_expr->back()->prepare_r(); - if (-1 == _direct_vec_conjunct_size) { - _direct_vec_conjunct_size = vec_expr->size() - 1; - } - } - } else { - row_expr->emplace_back(); - RETURN_IF_ERROR(Expr::copy_expr(_runtime_state->obj_pool(), - _conjunct_ctxs[i], - &row_expr->back())); - RETURN_IF_ERROR(Expr::prepare(row_expr->back(), - _runtime_state, - row_desc(), - disable_codegen)); - if (i >= _direct_conjunct_size) { - row_expr->back()->prepare_r(); - if (-1 == _direct_row_conjunct_size) { - _direct_row_conjunct_size = row_expr->size() - 1; - } - } - } - } -#endif - RETURN_IF_ERROR(Expr::clone_if_not_exists(_conjunct_ctxs, state, row_ctxs)); - - if (-1 == _direct_vec_conjunct_size) { - 
_direct_vec_conjunct_size = vec_ctxs->size(); - } - if (-1 == _direct_row_conjunct_size) { - _direct_row_conjunct_size = row_ctxs->size(); - } + _transfer_thread.add_thread( + new boost::thread( + &OlapScanNode::transfer_thread, this, state)); return Status::OK; } @@ -1012,9 +961,8 @@ Status OlapScanNode::get_sub_scan_range( std::vector scan_key_range; RETURN_IF_ERROR(_scan_keys.get_key_range(&scan_key_range)); - if (_is_result_order || - limit() != -1 || - scan_key_range.size() > 64) { + if (limit() != -1 || + scan_key_range.size() > 64) { if (scan_key_range.size() != 0) { *sub_range = scan_key_range; } else { // [-oo, +oo] @@ -1030,7 +978,7 @@ Status OlapScanNode::get_sub_scan_range( _scan_keys.end_include(), scan_key_range, sub_range, - _scanner_profile).ok()) { + _runtime_profile.get()).ok()) { if (scan_key_range.size() != 0) { *sub_range = scan_key_range; } else { // [-oo, +oo] @@ -1044,324 +992,11 @@ Status OlapScanNode::get_sub_scan_range( return Status::OK; } -Status OlapScanNode::transfer_open_scanners(RuntimeState* state) { - Status status = Status::OK; - std::list::iterator iter = _olap_scanners.begin(); - - for (int i = 0; iter != _olap_scanners.end(); ++iter, ++i) { - // 1.1 open each scanner - status = (*iter)->open(); - - if (!status.ok()) { - return status; - } - - status = create_conjunct_ctxs( - state, (*iter)->row_conjunct_ctxs(), (*iter)->vec_conjunct_ctxs(), true); - if (!status.ok()) { - return status; - } - - // 1.2 init result array - (*iter)->set_id(i); - } - - return status; -} - -TransferStatus OlapScanNode::read_row_batch(RuntimeState* state) { - // Get a RowBatch from the scanner which need to be read - RowBatch* scan_batch = NULL; - int cur_id = 0; - - while (true) { - boost::unique_lock l(_scan_batches_lock); - - if (UNLIKELY(_transfer_done)) { - while (!_scanner_done) { - _scan_batch_added_cv.wait(l); - } - - return FININSH; - } - - while (LIKELY(!_scan_row_batches.empty())) { - scan_batch = dynamic_cast(_scan_row_batches.front()); - 
_scan_row_batches.pop_front(); - DCHECK(scan_batch != NULL); - - // push RowBatch into scanner result array - VLOG(1) << "Push RowBatch " << scan_batch->scanner_id(); - _merge_rowbatches[scan_batch->scanner_id()].push_back(scan_batch); - } - - if (-1 == _merge_scanner_id) { - for (; cur_id < _merge_rowbatches.size(); ++cur_id) { - if (_merge_rowbatches[cur_id].empty()) { - // this scanner has finished - if (_fin_olap_scanners[cur_id] == NULL) { - break; - } - } - } - - if (cur_id == _merge_rowbatches.size()) { - return INIT_HEAP; - } - } else { - if (_merge_rowbatches[_merge_scanner_id].empty() - && _fin_olap_scanners[_merge_scanner_id] != NULL) { - _scanner_fin_flags[_merge_scanner_id] = true; - return MERGE; - } else if (!_merge_rowbatches[_merge_scanner_id].empty()) { - return MERGE; - } - } - - if (!_olap_scanners.empty()) { - std::list::iterator iter = _olap_scanners.begin(); - - while (iter != _olap_scanners.end()) { - if (-1 == _merge_scanner_id || _merge_scanner_id == (*iter)->id()) { - PriorityThreadPool::Task task; - task.work_function = boost::bind(&OlapScanNode::scanner_thread, this, *iter); - task.priority = _nice; - if (state->exec_env()->thread_pool()->offer(task)) { - _olap_scanners.erase(iter++); - } else { - LOG(FATAL) << "Failed to assign scanner task to thread pool!"; - } - } else { - ++iter; - } - ++_total_assign_num; - } - // scanner_row_num = 16k - // 16k * 10 * 12 * 8 = 15M(>2s) --> nice=10 - // 16k * 20 * 22 * 8 = 55M(>6s) --> nice=0 - while (_nice > 0 - && _total_assign_num > (22 - _nice) * (20 - _nice) * 6) { - --_nice; - } - } - - // 2.2 wait when all scanner are running & no result in queue - int completed = _progress.num_complete(); - while (completed == _progress.num_complete() - && _scan_row_batches.empty() - && !_scanner_done) { - _scan_batch_added_cv.wait(l); - } - } -} - -TransferStatus OlapScanNode::init_merge_heap(Heap& heap) { - for (int i = 0; i < _merge_rowbatches.size(); ++i) { - if (!_merge_rowbatches[i].empty()) { - Tuple* 
tuple = _merge_rowbatches[i].front()->get_row( - _merge_row_idxs[i])->get_tuple(_tuple_idx); - - if (VLOG_ROW_IS_ON) { - VLOG_ROW << "SortMerge input row: " << print_tuple(tuple, *_tuple_desc); - } - - ++_merge_row_idxs[i]; - HeapType v; - v.tuple = tuple; - v.id = i; - heap.push(v); - } - } - - return BUILD_ROWBATCH; -} - -TransferStatus OlapScanNode::build_row_batch(RuntimeState* state) { - // _merge_rowbatch = new RowBatch(this->row_desc(), state->batch_size(), mem_tracker()); - _merge_rowbatch = new RowBatch( - this->row_desc(), state->batch_size(), state->fragment_mem_tracker()); - uint8_t* tuple_buf = _merge_rowbatch->tuple_data_pool()->allocate( - state->batch_size() * _tuple_desc->byte_size()); - DCHECK(tuple_buf != NULL); - //bzero(tuple_buf, state->batch_size() * _tuple_desc->byte_size()); - _merge_tuple = reinterpret_cast(tuple_buf); - return MERGE; -} - -TransferStatus OlapScanNode::sorted_merge(Heap& heap) { - ScopedTimer merge_timer(_merge_timer); - - while (true) { - if (heap.empty()) { - return FININSH; - } - - // 1. Break if RowBatch is Full, Try to read new RowBatch - if (_merge_rowbatch->is_full()) { - return ADD_ROWBATCH; - } - - // 2. 
Check top tuple's scanner has rowbatch in result vec - HeapType v = heap.top(); - Tuple* pop_tuple = v.tuple; - _merge_scanner_id = v.id; - - if (!_scanner_fin_flags[_merge_scanner_id]) { - if (_merge_row_idxs[_merge_scanner_id] - >= _merge_rowbatches[_merge_scanner_id].front()->num_rows()) { - if (_backup_rowbatches[_merge_scanner_id] != NULL) { - delete _backup_rowbatches[_merge_scanner_id]; - } - - _backup_rowbatches[_merge_scanner_id] - = _merge_rowbatches[_merge_scanner_id].front(); - - VLOG(1) << "Pop RowBatch " << _merge_scanner_id; - _merge_rowbatches[_merge_scanner_id].pop_front(); - _merge_row_idxs[_merge_scanner_id] = 0; - } - - if (_merge_rowbatches[_merge_scanner_id].empty()) { - return READ_ROWBATCH; - } - } - - pop_tuple->deep_copy(_merge_tuple, *_tuple_desc, _merge_rowbatch->tuple_data_pool(), false); - // 3. Get top tuple of heap and push into new rowbatch - heap.pop(); - - if (VLOG_ROW_IS_ON) { - VLOG_ROW << "SortMerge output row: " << print_tuple(_merge_tuple, *_tuple_desc); - } - - int row_idx = _merge_rowbatch->add_row(); - TupleRow* row = _merge_rowbatch->get_row(row_idx); - row->set_tuple(_tuple_idx, _merge_tuple); - _merge_rowbatch->commit_last_row(); - - char* new_tuple = reinterpret_cast(_merge_tuple); - new_tuple += _tuple_desc->byte_size(); - _merge_tuple = reinterpret_cast(new_tuple); - - // 4. push scanner's next tuple into heap - if (!_scanner_fin_flags[_merge_scanner_id]) { - Tuple* push_tuple = _merge_rowbatches[_merge_scanner_id].front()->get_row( - _merge_row_idxs[_merge_scanner_id])->get_tuple(_tuple_idx); - ++_merge_row_idxs[_merge_scanner_id]; - - v.tuple = push_tuple; - - if (VLOG_ROW_IS_ON) { - VLOG_ROW << "SortMerge input row: " << print_tuple(v.tuple, *_tuple_desc); - } - - heap.push(v); - } - } -} - -void OlapScanNode::merge_transfer_thread(RuntimeState* state) { - // 1. 
Prepare to Start MergeTransferThread - VLOG(1) << "MergeTransferThread Start."; - Status status = Status::OK; - - // 1.1 scanner open - status = transfer_open_scanners(state); - - // 1.2 find sort column - const std::vector& slots = _tuple_desc->slots(); - int i = 0; - - while (i < slots.size()) { - if (slots[i]->col_name() == _sort_column) { - VLOG(1) << "Sort Slot: " << slots[i]->debug_string(); - break; - } - - ++i; - } - - if (i >= slots.size()) { - status = Status("Counldn't find sort column"); - } - - // 2. Merge ScannerThread' result - if (status.ok()) { - // 2.1 init data structure - _merge_scanner_id = -1; - Heap heap(MergeComparison(get_compare_func(slots[i]->type().type), slots[i]->tuple_offset())); - - _merge_rowbatches.resize(_olap_scanners.size()); - _merge_row_idxs.resize(_olap_scanners.size(), 0); - _fin_olap_scanners.resize(_olap_scanners.size(), NULL); - _scanner_fin_flags.resize(_olap_scanners.size(), false); - _backup_rowbatches.resize(_olap_scanners.size(), NULL); - _total_assign_num = 0; - _nice = 20; - - // 2.2 read from scanner and order by _sort_column - TransferStatus transfer_status = READ_ROWBATCH; - bool flag = true; - - // 1. read one row_batch from each scanner - // 2. use each row_batch' first tuple_row to build heap - // 3. pop one tuple_row & push one tuple_row from same row_batch - // 3.1 if row_batch is empty, read one row_batch from corresponding scanner - // 3.2 if scanner is finish, just pop the tuple_row without push - // 4. 
finish when heap is empty - while (flag) { - switch (transfer_status) { - case INIT_HEAP: - transfer_status = init_merge_heap(heap); - break; - - case READ_ROWBATCH: - transfer_status = read_row_batch(state); - break; - - case BUILD_ROWBATCH: - transfer_status = build_row_batch(state); - break; - - case MERGE: - transfer_status = sorted_merge(heap); - break; - - case ADD_ROWBATCH: - add_one_batch(_merge_rowbatch); - transfer_status = BUILD_ROWBATCH; - break; - - case FININSH: - add_one_batch(_merge_rowbatch); - flag = false; - break; - - default: - DCHECK(false); - break; - } - } - } else { - boost::lock_guard guard(_status_mutex); - _status = status; - } - - VLOG(1) << "MergeTransferThread finish."; - boost::unique_lock l(_row_batches_lock); - _transfer_done = true; - _row_batch_added_cv.notify_all(); -} - void OlapScanNode::transfer_thread(RuntimeState* state) { - Status status = Status::OK; - // scanner open pushdown to scanThread - std::list::iterator iter = _olap_scanners.begin(); - - for (; iter != _olap_scanners.end(); ++iter) { - status = create_conjunct_ctxs( - state, (*iter)->row_conjunct_ctxs(), (*iter)->vec_conjunct_ctxs(), true); + Status status = Status::OK; + for (auto scanner : _olap_scanners) { + status = Expr::clone_if_not_exists(_conjunct_ctxs, state, scanner->conjunct_ctxs()); if (!status.ok()) { boost::lock_guard guard(_status_mutex); _status = status; @@ -1436,7 +1071,7 @@ void OlapScanNode::transfer_thread(RuntimeState* state) { } } - iter = olap_scanners.begin(); + auto iter = olap_scanners.begin(); while (iter != olap_scanners.end()) { PriorityThreadPool::Task task; task.work_function = boost::bind(&OlapScanNode::scanner_thread, this, *iter); @@ -1519,164 +1154,42 @@ void OlapScanNode::scanner_thread(OlapScanner* scanner) { } std::vector row_batchs; - std::vector* row_conjunct_ctxs = scanner->row_conjunct_ctxs(); - //std::vector* vec_conjunct_ctxs = scanner->vec_conjunct_ctxs(); - bool _use_pushdown_conjuncts = true; - int64_t 
total_rows_reader_counter = 0; - while (!eos && total_rows_reader_counter < config::palo_scanner_row_num) { - // 1. Allocate one row batch - // RowBatch *row_batch = new RowBatch(this->row_desc(), state->batch_size(), mem_tracker()); + // Because we use thread pool to scan data from storage. One scanner can't + // use this thread too long, this can starve other query's scanner. So, we + // need yield this thread when we do enough work. However, OlapStorage read + // data in pre-aggregate mode, then we can't use storage returned data to + // judge if we need to yield. So we record all raw data read in this round + // scan, if this exceed threshold, we yield this thread. + int64_t raw_rows_read = scanner->raw_rows_read(); + int64_t raw_rows_threshold = raw_rows_read + config::palo_scanner_row_num; + while (!eos && raw_rows_read < raw_rows_threshold) { + if (UNLIKELY(_transfer_done)) { + eos = true; + status = Status::CANCELLED; + LOG(INFO) << "Scan thread cancelled, cause query done, maybe reach limit."; + break; + } RowBatch *row_batch = new RowBatch( this->row_desc(), state->batch_size(), _runtime_state->fragment_mem_tracker()); row_batch->set_scanner_id(scanner->id()); - // 2. Allocate Row's Tuple buf - uint8_t *tuple_buf = row_batch->tuple_data_pool()->allocate( - state->batch_size() * _tuple_desc->byte_size()); - bzero(tuple_buf, state->batch_size() * _tuple_desc->byte_size()); - Tuple *tuple = reinterpret_cast(tuple_buf); - - int direct_return_counter = 0; - int pushdown_return_counter = 0; - int rows_read_counter = 0; - // 3. 
Read data to each tuple - while (true) { - // 3.1 Break if RowBatch is Full, Try to read new RowBatch - if (row_batch->is_full()) { - break; - } - // 3.2 Stoped if Scanner has been cancelled - if (UNLIKELY(_transfer_done)) { - eos = true; - status = Status::CANCELLED; - LOG(INFO) << "Scan thread cancelled, " - "cause query done, maybe reach limit."; - break; - } - // 3.3 Read tuple from OlapEngine - status = scanner->get_next(tuple, &total_rows_reader_counter, &eos); - if (UNLIKELY(!status.ok())) { - LOG(ERROR) << "Scan thread read OlapScanner failed!"; - eos = true; - break; - } - if (UNLIKELY(eos)) { - // this scanner read all data, break; - break; - } - - if (VLOG_ROW_IS_ON) { - VLOG_ROW << "OlapScanner input row: " << print_tuple(tuple, *_tuple_desc); - } - // 3.4 Set tuple to RowBatch(not commited) - int row_idx = row_batch->add_row(); - TupleRow* row = row_batch->get_row(row_idx); - row->set_tuple(_tuple_idx, tuple); - - do { - // SCOPED_TIMER(_eval_timer); - - // 3.5.1 Using direct conjuncts to filter data - if (_eval_conjuncts_fn != NULL) { - if (!_eval_conjuncts_fn(&((*row_conjunct_ctxs)[0]), _direct_row_conjunct_size, row)) { - // check direct conjuncts fail then clear tuple for reuse - // make sure to reset null indicators since we're overwriting - // the tuple assembled for the previous row - tuple->init(_tuple_desc->byte_size()); - break; - } - } else { - if (!eval_conjuncts(&((*row_conjunct_ctxs)[0]), _direct_row_conjunct_size, row)) { - // check direct conjuncts fail then clear tuple for reuse - // make sure to reset null indicators since we're overwriting - // the tuple assembled for the previous row - tuple->init(_tuple_desc->byte_size()); - break; - } - } - - - ++direct_return_counter; - - // 3.5.2 Using pushdown conjuncts to filter data - if (_use_pushdown_conjuncts - && row_conjunct_ctxs->size() > _direct_conjunct_size) { - if (!eval_conjuncts(&((*row_conjunct_ctxs)[_direct_conjunct_size]), - row_conjunct_ctxs->size() - _direct_conjunct_size, 
row)) { - // check pushdown conjuncts fail then clear tuple for reuse - // make sure to reset null indicators since we're overwriting - // the tuple assembled for the previous row - tuple->init(_tuple_desc->byte_size()); - - break; - } - } - - int string_slots_size = _string_slots.size(); - for (int i = 0; i < string_slots_size; ++i) { - StringValue* slot = tuple->get_string_slot(_string_slots[i]->tuple_offset()); - if (0 != slot->len) { - uint8_t* v = row_batch->tuple_data_pool()->allocate(slot->len); - memory_copy(v, slot->ptr, slot->len); - slot->ptr = reinterpret_cast(v); - } - } - - if (VLOG_ROW_IS_ON) { - VLOG_ROW << "OlapScanner output row: " << print_tuple(tuple, *_tuple_desc); - } - - // check direct && pushdown conjuncts success then commit tuple - row_batch->commit_last_row(); - char* new_tuple = reinterpret_cast(tuple); - new_tuple += _tuple_desc->byte_size(); - tuple = reinterpret_cast(new_tuple); - - ++pushdown_return_counter; - } while (0); - - ++rows_read_counter; - if (total_rows_reader_counter >= config::palo_scanner_row_num) { - break; - } + status = scanner->get_batch(_runtime_state, row_batch, &eos); + if (!status.ok()) { + LOG(WARNING) << "Scan thread read OlapScanner failed!"; + eos = true; + break; } - - - COUNTER_UPDATE(_pushdown_return_counter, pushdown_return_counter); - COUNTER_UPDATE(_direct_return_counter, direct_return_counter); - COUNTER_UPDATE(this->rows_read_counter(), rows_read_counter); - // 4. if status not ok, change status_. - if (UNLIKELY(0 == row_batch->num_rows())) { + if (UNLIKELY(row_batch->num_rows() == 0)) { // may be failed, push already, scan node delete this batch. 
delete row_batch; row_batch = NULL; } else { - // compute pushdown conjuncts filter rate - if (_use_pushdown_conjuncts) { - int32_t pushdown_return_rate - = _pushdown_return_counter->value() * 100 / _direct_return_counter->value(); - if (pushdown_return_rate > config::palo_max_pushdown_conjuncts_return_rate) { - _use_pushdown_conjuncts = false; - VLOG(2) << "Stop Using PushDown Conjuncts. " - << "PushDownReturnRate: " << pushdown_return_rate << "%" - << " MaxPushDownReturnRate: " - << config::palo_max_pushdown_conjuncts_return_rate << "%"; - } else { - //VLOG(1) << "PushDownReturnRate: " << pushdown_return_rate << "%"; - } - } row_batchs.push_back(row_batch); __sync_fetch_and_add(&_buffered_bytes, row_batch->tuple_data_pool()->total_reserved_bytes()); } - } - - - // update raw rows number readed from tablet - RuntimeProfile::Counter* raw_rows_counter = _scanner_profile->get_counter("RawRowsRead"); - if (raw_rows_counter != NULL) { - COUNTER_UPDATE(raw_rows_counter, total_rows_reader_counter); + raw_rows_read = scanner->raw_rows_read(); } boost::unique_lock l(_scan_batches_lock); @@ -1684,7 +1197,9 @@ void OlapScanNode::scanner_thread(OlapScanner* scanner) { if (UNLIKELY(!status.ok())) { _transfer_done = true; boost::lock_guard guard(_status_mutex); - _status = status; + if (LIKELY(_status.ok())) { + _status = status; + } } bool global_status_ok = false; @@ -1694,11 +1209,11 @@ void OlapScanNode::scanner_thread(OlapScanner* scanner) { } if (UNLIKELY(!global_status_ok)) { eos = true; - BOOST_FOREACH(RowBatch* rb, row_batchs) { + for (auto rb : row_batchs) { delete rb; } } else { - BOOST_FOREACH(RowBatch* rb, row_batchs) { + for (auto rb : row_batchs) { _scan_row_batches.push_back(rb); } } @@ -1709,9 +1224,7 @@ void OlapScanNode::scanner_thread(OlapScanner* scanner) { // this is the right out _scanner_done = true; } - if (_is_result_order) { - _fin_olap_scanners[scanner->id()] = scanner; - } + scanner->close(_runtime_state); } else { 
_olap_scanners.push_front(scanner); } @@ -1719,224 +1232,6 @@ void OlapScanNode::scanner_thread(OlapScanner* scanner) { _scan_batch_added_cv.notify_one(); } -#if 0 -void OlapScanNode::vectorized_scanner_thread(OlapScanner* scanner) { - Status status = Status::OK; - std::vector row_batchs; - std::vector* row_conjunct_ctxs = scanner->row_conjunct_ctxs(); - std::vector* vec_conjunct_ctxs = scanner->vec_conjunct_ctxs(); - RuntimeState* state = scanner->runtime_state(); - DCHECK(NULL != state); - - // read from scanner - int total_rows_reader_counter = 0; - bool eos = false; - bool _use_pushdown_conjuncts = true; - DCHECK_GE(row_desc().tuple_descriptors().size(), 1); - std::shared_ptr vectorized_row_batch( - new VectorizedRowBatch(*(row_desc().tuple_descriptors()[0]), 1024)); - //vectorized_row_batch->mem_pool()->set_limits(*state->mem_trackers()); - - do { - // 1. Allocate one row batch - RowBatch* row_batch = new RowBatch(row_desc(), state->batch_size()); - row_batch->tuple_data_pool()->set_limits(*state->mem_trackers()); - row_batch->set_scanner_id(scanner->id()); - - // 3. 
Read data to each tuple - while (true) { - // 3.1 Break if RowBatch is Full, Try to read new RowBatch - if (row_batch->is_full()) { - break; - } - - // 3.2 Stoped if Scanner has been cancelled - if (UNLIKELY(_transfer_done)) { - eos = true; - status = Status::CANCELLED; - LOG(INFO) << "Scan thread cancelled, " - "cause query done, maybe reach limit."; - break; - } - - // 3.3 Read vectorized_row_batch from OlapEngine - if (vectorized_row_batch->is_iterator_end()) { - if (total_rows_reader_counter >= config::palo_scanner_row_num) { - break; - } - status = scanner->get_next(vectorized_row_batch.get(), &eos); - if (UNLIKELY(!status.ok())) { - LOG(ERROR) << "Scan thread read OlapScanner failed!"; - eos = true; - break; - } - if (UNLIKELY(eos)) { - // this scanner read all data, break; - LOG(INFO) << "Scan thread read OlapScanner finish."; - break; - } - COUNTER_UPDATE(rows_read_counter(), vectorized_row_batch->num_rows()); - total_rows_reader_counter += vectorized_row_batch->num_rows(); - - eval_vectorized_conjuncts(vectorized_row_batch, vec_conjunct_ctxs); - } - - if (row_conjunct_ctxs->size() > 0) { - eval_row_based_conjuncts(vectorized_row_batch, row_batch, row_conjunct_ctxs); - } else { - vectorized_row_batch->to_row_batch(row_batch); - } - - if (VLOG_ROW_IS_ON) { - for (int i = 0; i < row_batch->num_rows(); ++i) { - TupleRow* row = row_batch->get_row(i); - VLOG_ROW << "VectorizedScannerThread ouput row: " << print_row(row, row_desc()); - } - } - } - - // 4. if status not ok, change _status. - if (UNLIKELY(0 == row_batch->num_rows())) { - // may be failed, push already, scan node delete this batch. 
- delete row_batch; - row_batch = NULL; - } else { - // compute pushdown conjuncts filter rate - if (_use_pushdown_conjuncts) { - int32_t pushdown_return_rate - = _pushdown_return_counter->value() * 100 / _direct_return_counter->value(); - if (pushdown_return_rate > config::palo_max_pushdown_conjuncts_return_rate) { - _use_pushdown_conjuncts = false; - VLOG(2) << "Stop Using PushDown Conjuncts. " - << "PushDownReturnRate: " << pushdown_return_rate << "%" - << " MaxPushDownReturnRate: " - << config::palo_max_pushdown_conjuncts_return_rate << "%"; - } else { - VLOG(2) << "PushDownReturnRate: " << pushdown_return_rate << "%"; - } - } - row_batchs.push_back(row_batch); - } - } while ((total_rows_reader_counter < config::palo_scanner_row_num - || !vectorized_row_batch->is_iterator_end()) - && !eos); - - boost::unique_lock l(_scan_batches_lock); - // if we failed, check status. - if (UNLIKELY(!status.ok())) { - _transfer_done = true; - _status = status; - } - if (UNLIKELY(!_status.ok())) { - eos = true; - BOOST_FOREACH(RowBatch* rb, row_batchs) { - delete rb; - } - } else { - BOOST_FOREACH(RowBatch* rb, row_batchs) { - _scan_row_batches.push_back(rb); - } - } - // Scanner thread completed. 
Take a look and update the status - if (UNLIKELY(eos)) { - _progress.update(1); - if (_progress.done()) { - // this is the right out - _scanner_done = true; - } - if (_is_result_order) { - _fin_olap_scanners[scanner->id()] = scanner; - } - } else { - _olap_scanners.push_front(scanner); - } - _scan_batch_added_cv.notify_one(); -} - -void OlapScanNode::eval_vectorized_conjuncts( - std::shared_ptr vectorized_row_batch, - std::vector* vec_conjunct_ctxs) { - for (int i = 0; i < _direct_vec_conjunct_size; ++i) { - (*vec_conjunct_ctxs)[i]->evaluate(vectorized_row_batch.get()); - } - COUNTER_UPDATE(_direct_return_counter, vectorized_row_batch->num_rows()); - - if (_use_pushdown_conjuncts) { - for (int i = _direct_vec_conjunct_size; i < vec_conjunct_ctxs->size(); ++i) { - (*vec_conjunct_ctxs)[i]->evaluate(vectorized_row_batch.get()); - } - } - COUNTER_UPDATE(_pushdown_return_counter, vectorized_row_batch->num_rows()); -} - - -void OlapScanNode::eval_row_based_conjuncts( - std::shared_ptr vectorized_row_batch, - RowBatch* row_batch, - std::vector* row_conjunct_ctxs) { - int row_remain = row_batch->capacity() - row_batch->num_rows(); - uint8_t* tuple_buf = row_batch->tuple_data_pool()->allocate( - row_remain * _tuple_desc->byte_size()); - bzero(tuple_buf, row_remain * _tuple_desc->byte_size()); - Tuple* tuple = reinterpret_cast(tuple_buf); - - while (vectorized_row_batch->get_next_tuple(tuple)) { - int row_idx = row_batch->add_row(); - TupleRow* row = row_batch->get_row(row_idx); - row->set_tuple(_tuple_idx, tuple); - - do { - // 3.5.1 Using direct conjuncts to filter data - if (!eval_conjuncts(&((*row_conjunct_ctxs)[0]), - _direct_row_conjunct_size, - row)) { - // check direct conjuncts fail then clear tuple for reuse - // make sure to reset null indicators since we're overwriting - // the tuple assembled for the previous row - tuple->init(_tuple_desc->byte_size()); - break; - } - - COUNTER_UPDATE(_direct_return_counter, 1); - - // 3.5.2 Using pushdown conjuncts to filter 
data - if (_use_pushdown_conjuncts - && row_conjunct_ctxs->size() > _direct_conjunct_size) { - if (!eval_conjuncts(&((*row_conjunct_ctxs)[_direct_conjunct_size]), - row_conjunct_ctxs->size() - _direct_conjunct_size, row)) { - // check pushdown conjuncts fail then clear tuple for reuse - // make sure to reset null indicators since we're overwriting - // the tuple assembled for the previous row - tuple->init(_tuple_desc->byte_size()); - break; - } - } - - int string_slots_size = _string_slots.size(); - for (int i = 0; i < string_slots_size; ++i) { - StringValue* slot - = tuple->get_string_slot(_string_slots[i]->tuple_offset()); - uint8_t* v = row_batch->tuple_data_pool()->allocate(slot->len); - memcpy(v, slot->ptr, slot->len); - slot->ptr = reinterpret_cast(v); - } - - // check direct && pushdown conjuncts success then commit tuple - row_batch->commit_last_row(); - char* new_tuple = reinterpret_cast(tuple); - new_tuple += _tuple_desc->byte_size(); - tuple = reinterpret_cast(new_tuple); - - COUNTER_UPDATE(_pushdown_return_counter, 1); - } while (0); - - if (row_batch->is_full()) { - break; - } - } -} -#endif - Status OlapScanNode::add_one_batch(RowBatchInterface* row_batch) { { boost::unique_lock l(_row_batches_lock); diff --git a/be/src/exec/olap_scan_node.h b/be/src/exec/olap_scan_node.h index e5869fbde9..f6877cdfc9 100644 --- a/be/src/exec/olap_scan_node.h +++ b/be/src/exec/olap_scan_node.h @@ -51,7 +51,7 @@ class OlapScanNode : public ScanNode { public: OlapScanNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs); ~OlapScanNode(); - virtual Status init(const TPlanNode& tnode); + virtual Status init(const TPlanNode& tnode, RuntimeState* state = nullptr); virtual Status prepare(RuntimeState* state); virtual Status open(RuntimeState* state); virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos); @@ -138,12 +138,6 @@ protected: Status split_scan_range(); Status start_scan_thread(RuntimeState* state); - Status 
create_conjunct_ctxs( - RuntimeState* state, - std::vector* row_expr, - std::vector* vec_expr, - bool disable_codegen); - template Status normalize_predicate(ColumnValueRange& range, SlotDescriptor* slot); @@ -162,23 +156,18 @@ protected: void scanner_thread(OlapScanner* scanner); Status add_one_batch(RowBatchInterface* row_batch); - Status transfer_open_scanners(RuntimeState* state); - - TransferStatus init_merge_heap(Heap& heap); - TransferStatus read_row_batch(RuntimeState* state); - TransferStatus build_row_batch(RuntimeState* state); - TransferStatus sorted_merge(Heap& heap); - - void merge_transfer_thread(RuntimeState* state); // Write debug string of this into out. virtual void debug_string(int indentation_level, std::stringstream* out) const; private: + void _init_counter(RuntimeState* state); + void construct_is_null_pred_in_where_pred(Expr* expr, SlotDescriptor* slot, std::string is_null_str); + friend class OlapScanner; + std::vector _is_null_vector; - boost::scoped_ptr _thrift_plan_node; // Tuple id resolved in prepare() to set _tuple_desc; TupleId _tuple_id; // palo scan node used to scan palo @@ -206,8 +195,6 @@ private: // Order Result Flag bool _is_result_order; - // Result RowBatch order by this column - std::string _sort_column; // Pool for storing allocated scanner objects. 
We don't want to use the // runtime pool to ensure that the scanner objects are deleted before this @@ -239,45 +226,14 @@ private: std::list _all_olap_scanners; std::list _olap_scanners; - std::vector _fin_olap_scanners; - - // indicate which scanner need to read - // -1 means all - int _merge_scanner_id; - - // each scanner's RowBatch array, index with scanner id - std::vector > _merge_rowbatches; - - // each scanner's lastest RowBatch removed from _merge_rowbatches - // store here because it's lastest TupleRow still in heap - // it will delete at ScanNode's destruct - std::vector _backup_rowbatches; - - // first Rowbatch's row_idx of each _merge_rowbatches - // >0 means row index - // -1 means scanner has finished - std::vector _merge_row_idxs; - - // finish flag of each scanner - std::vector _scanner_fin_flags; - - // present RowBatch MergeTransferThread processing - RowBatch* _merge_rowbatch; - - // present TupleRow MergeTransferThread processing - Tuple* _merge_tuple; int _max_materialized_row_batches; bool _start; bool _scanner_done; bool _transfer_done; - bool _use_pushdown_conjuncts; size_t _direct_conjunct_size; - size_t _direct_row_conjunct_size; - size_t _direct_vec_conjunct_size; boost::posix_time::time_duration _wait_duration; - bool _delete; int _total_assign_num; int _nice; @@ -285,20 +241,34 @@ private: boost::mutex _status_mutex; Status _status; RuntimeState* _runtime_state; - RuntimeProfile::Counter* _olap_thread_scan_timer; - RuntimeProfile::Counter* _eval_timer; - RuntimeProfile::Counter* _merge_timer; - RuntimeProfile::Counter* _pushdown_return_counter; - RuntimeProfile::Counter* _direct_return_counter; + RuntimeProfile::Counter* _scan_timer; RuntimeProfile::Counter* _tablet_counter; - - RuntimeProfile* _scanner_profile; + RuntimeProfile::Counter* _rows_pushed_cond_filtered_counter = nullptr; TResourceInfo* _resource_info; int64_t _buffered_bytes; int64_t _running_thread; EvalConjunctsFn _eval_conjuncts_fn; + + // Counters + 
RuntimeProfile::Counter* _io_timer = nullptr; + RuntimeProfile::Counter* _read_compressed_counter = nullptr; + RuntimeProfile::Counter* _decompressor_timer = nullptr; + RuntimeProfile::Counter* _read_uncompressed_counter = nullptr; + RuntimeProfile::Counter* _raw_rows_counter = nullptr; + + RuntimeProfile::Counter* _rows_vec_cond_counter = nullptr; + RuntimeProfile::Counter* _vec_cond_timer = nullptr; + + RuntimeProfile::Counter* _stats_filtered_counter = nullptr; + RuntimeProfile::Counter* _del_filtered_counter = nullptr; + + RuntimeProfile::Counter* _block_load_timer = nullptr; + RuntimeProfile::Counter* _block_load_counter = nullptr; + RuntimeProfile::Counter* _block_fetch_timer = nullptr; + + RuntimeProfile::Counter* _index_load_timer = nullptr; }; } // namespace palo diff --git a/be/src/exec/olap_scanner.cpp b/be/src/exec/olap_scanner.cpp index bd106c3af7..a42b43f261 100644 --- a/be/src/exec/olap_scanner.cpp +++ b/be/src/exec/olap_scanner.cpp @@ -21,6 +21,7 @@ #include "olap_scan_node.h" #include "olap_utils.h" #include "olap/olap_reader.h" +#include "olap/field.h" #include "service/backend_options.h" #include "runtime/descriptors.h" #include "runtime/runtime_state.h" @@ -28,6 +29,7 @@ #include "runtime/mem_tracker.h" #include "util/mem_util.hpp" #include "util/network_util.h" +#include "util/palo_metrics.h" namespace palo { @@ -37,134 +39,420 @@ static const std::string MATERIALIZE_TUPLE_TIMER = "MaterializeTupleTime(*)"; OlapScanner::OlapScanner( - RuntimeState* runtime_state, - const boost::shared_ptr scan_range, - const std::vector& key_ranges, - const std::vector& olap_filter, - const TupleDescriptor& tuple_desc, - RuntimeProfile* profile, - const std::vector is_null_vector) : - _runtime_state(runtime_state), - _tuple_desc(tuple_desc), - _scan_range(scan_range), - _key_ranges(key_ranges), - _olap_filter(olap_filter), - _profile(profile), - _is_open(false), - _is_null_vector(is_null_vector) { - _reader.reset(OLAPReader::create(tuple_desc, runtime_state)); 
+ RuntimeState* runtime_state, + OlapScanNode* parent, + bool aggregation, + PaloScanRange* scan_range, + const std::vector& key_ranges) + : _runtime_state(runtime_state), + _parent(parent), + _tuple_desc(parent->_tuple_desc), + _profile(parent->runtime_profile()), + _string_slots(parent->_string_slots), + _is_open(false), + _aggregation(aggregation), + _tuple_idx(parent->_tuple_idx), + _direct_conjunct_size(parent->_direct_conjunct_size) { + _reader.reset(new Reader()); DCHECK(_reader.get() != NULL); + _ctor_status = _prepare(scan_range, key_ranges, parent->_olap_filter, parent->_is_null_vector); + if (!_ctor_status.ok()) { + LOG(WARNING) << "OlapScanner preapre failed, status:" << _ctor_status.get_error_msg(); + } + _rows_read_counter = parent->rows_read_counter(); + _rows_pushed_cond_filtered_counter = parent->_rows_pushed_cond_filtered_counter; } OlapScanner::~OlapScanner() { } -bool OlapScanner::is_open() { - return _is_open; -} -void OlapScanner::set_opened() { - _is_open = true; +Status OlapScanner::_prepare( + PaloScanRange* scan_range, const std::vector& key_ranges, + const std::vector& filters, const std::vector& is_nulls) { + // Get olap table + TTabletId tablet_id = scan_range->scan_range().tablet_id; + SchemaHash schema_hash = + strtoul(scan_range->scan_range().schema_hash.c_str(), nullptr, 10); + _version = + strtoul(scan_range->scan_range().version.c_str(), nullptr, 10); + VersionHash version_hash = + strtoul(scan_range->scan_range().version_hash.c_str(), nullptr, 10); + { + _olap_table = OLAPEngine::get_instance()->get_table(tablet_id, schema_hash); + if (_olap_table.get() == nullptr) { + OLAP_LOG_WARNING("table does not exists. [tablet_id=%ld schema_hash=%d]", + tablet_id, schema_hash); + return Status("table does not exists"); + } + { + AutoRWLock auto_lock(_olap_table->get_header_lock_ptr(), true); + const FileVersionMessage* message = _olap_table->latest_version(); + if (message == NULL) { + OLAP_LOG_WARNING("fail to get latest version. 
[tablet_id=%ld]", tablet_id); + return Status("fail to get latest version"); + } + + if (message->end_version() == _version + && message->version_hash() != version_hash) { + OLAP_LOG_WARNING("fail to check latest version hash. " + "[tablet_id=%ld version_hash=%ld request_version_hash=%ld]", + tablet_id, message->version_hash(), version_hash); + return Status("fail to check version hash"); + } + } + } + + // Initialize _params + { + RETURN_IF_ERROR(_init_params(key_ranges, filters, is_nulls)); + } + + return Status::OK; } Status OlapScanner::open() { - TFetchRequest fetch_request; - fetch_request.__set_use_compression(false); - fetch_request.__set_num_rows(256); - fetch_request.__set_schema_hash( - strtoul(_scan_range->scan_range().schema_hash.c_str(), NULL, 10)); - fetch_request.__set_version( - strtoul(_scan_range->scan_range().version.c_str(), NULL, 10)); - fetch_request.__set_version_hash( - strtoul(_scan_range->scan_range().version_hash.c_str(), NULL, 10)); - fetch_request.__set_tablet_id(_scan_range->scan_range().tablet_id); + RETURN_IF_ERROR(_ctor_status); - // fields - const std::vector& slots = _tuple_desc.slots(); - if (slots.size() <= 0) { - return Status("Failed to BuildOlapQuery, no query slot!"); + if (_conjunct_ctxs.size() > _direct_conjunct_size) { + _use_pushdown_conjuncts = true; } - for (int i = 0; i < slots.size(); ++i) { - if (!slots[i]->is_materialized()) { - continue; - } - - fetch_request.field.push_back(slots[i]->col_name()); - VLOG(3) << "Slot: name=" << slots[i]->col_name() << " type=" << slots[i]->type(); + auto res = _reader->init(_params); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to init reader.[res=%d]", res); + return Status("failed to initialize storage reader"); } + return Status::OK; +} - if (fetch_request.field.size() <= 0) { - return Status("Failed to BuildOlapQuery, no materialized slot!"); +Status OlapScanner::_init_params( + const std::vector& key_ranges, + const std::vector& filters, + const std::vector& 
is_nulls) { + RETURN_IF_ERROR(_init_return_columns()); + + _params.olap_table = _olap_table; + _params.reader_type = READER_FETCH; + _params.aggregation = _aggregation; + _params.version = Version(0, _version); + + // Condition + for (auto& filter : filters) { + _params.conditions.push_back(filter); } - - // begin, end key - for (auto key_range : _key_ranges) { + for (auto& is_null_str : is_nulls) { + _params.conditions.push_back(is_null_str); + } + // Range + for (auto& key_range : key_ranges) { if (key_range.begin_scan_range.size() == 1 && key_range.begin_scan_range[0] == NEGATIVE_INFINITY) { continue; } - fetch_request.__set_range(key_range.begin_include ? "ge" : "gt"); - fetch_request.__set_end_range(key_range.end_include ? "le" : "lt"); - TFetchStartKey start_key; + _params.range = (key_range.begin_include ? "ge" : "gt"); + _params.end_range = (key_range.end_include ? "le" : "lt"); + TFetchStartKey start_key; for (auto key : key_range.begin_scan_range) { start_key.key.push_back(key); } + _params.start_key.push_back(start_key); - fetch_request.start_key.push_back(start_key); TFetchEndKey end_key; - for (auto key : key_range.end_scan_range) { end_key.key.push_back(key); } - - fetch_request.end_key.push_back(end_key); + _params.end_key.push_back(end_key); } + // TODO(zc) + _params.profile = _profile; + _params.runtime_state = _runtime_state; - // where cause - for (auto filter : _olap_filter) { - fetch_request.where.push_back(filter); - } - for (auto is_null_str : _is_null_vector) { - fetch_request.where.push_back(is_null_str); + if (_aggregation) { + _params.return_columns = _return_columns; + } else { + for (size_t i = 0; i < _olap_table->num_key_fields(); ++i) { + _params.return_columns.push_back(i); + } + for (auto index : _return_columns) { + if (_olap_table->tablet_schema()[index].is_key) { + continue; + } else { + _params.return_columns.push_back(index); + } + } } - // output - fetch_request.__set_output("palo2"); - 
fetch_request.__set_aggregation(_aggregation); - - if (!_reader->init(fetch_request, &_vec_conjunct_ctxs, _profile).ok()) { - std::string local_ip = BackendOptions::get_localhost(); - std::stringstream ss; - if (MemTracker::limit_exceeded(*_runtime_state->mem_trackers())) { - ss << "Memory limit exceeded. Tablet: " << fetch_request.tablet_id << ". host: " << local_ip; - } else { - ss << "Storage Reader init fail. Tablet: " << fetch_request.tablet_id << ". host: " << local_ip; - } - return Status(ss.str()); + // use _params.return_columns, because reader use this to merge sort + OLAPStatus res = _read_row_cursor.init(_olap_table->tablet_schema(), _params.return_columns); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to init row cursor.[res=%d]", res); + return Status("failed to initialize storage read row cursor"); + } + _read_row_cursor.allocate_memory_for_string_type(_olap_table->tablet_schema()); + for (auto cid : _return_columns) { + _query_fields.push_back(_read_row_cursor.get_field_by_index(cid)); } return Status::OK; } -Status OlapScanner::get_next(Tuple* tuple, int64_t* raw_rows_read, bool* eof) { - if (!_reader->next_tuple(tuple, raw_rows_read, eof).ok()) { - if (MemTracker::limit_exceeded(*_runtime_state->mem_trackers())) { - LOG(ERROR) << "Memory limit exceeded."; - return Status("Internal Error: Memory limit exceeded."); +Status OlapScanner::_init_return_columns() { + for (auto slot : _tuple_desc->slots()) { + if (!slot->is_materialized()) { + continue; } - LOG(ERROR) << "read storage fail."; - return Status("Internal Error: read storage fail."); + int32_t index = _olap_table->get_field_index(slot->col_name()); + if (index < 0) { + std::stringstream ss; + ss << "field name is invalied. 
field=" << slot->col_name(); + LOG(WARNING) << ss.str(); + return Status(ss.str()); + } + _return_columns.push_back(index); + if (_olap_table->tablet_schema()[index].type == OLAP_FIELD_TYPE_VARCHAR || + _olap_table->tablet_schema()[index].type == OLAP_FIELD_TYPE_HLL) { + _request_columns_size.push_back( + _olap_table->tablet_schema()[index].length - sizeof(StringLengthType)); + } else { + _request_columns_size.push_back(_olap_table->tablet_schema()[index].length); + } + _query_slots.push_back(slot); + } + if (_return_columns.empty()) { + return Status("failed to build storage scanner, no materialized slot!"); } return Status::OK; } +Status OlapScanner::get_batch( + RuntimeState* state, RowBatch* batch, bool* eof) { + // 2. Allocate Row's Tuple buf + uint8_t *tuple_buf = batch->tuple_data_pool()->allocate( + state->batch_size() * _tuple_desc->byte_size()); + bzero(tuple_buf, state->batch_size() * _tuple_desc->byte_size()); + Tuple *tuple = reinterpret_cast(tuple_buf); + + int64_t raw_rows_threshold = raw_rows_read() + config::palo_scanner_row_num; + { + SCOPED_TIMER(_parent->_scan_timer); + while (true) { + // Batch is full, break + if (batch->is_full()) { + break; + } + // Read one row from reader + auto res = _reader->next_row_with_aggregation(&_read_row_cursor, eof); + if (res != OLAP_SUCCESS) { + return Status("Internal Error: read storage fail."); + } + // If we reach end of this scanner, break + if (UNLIKELY(*eof)) { + break; + } + + _num_rows_read++; + + _convert_row_to_tuple(tuple); + if (VLOG_ROW_IS_ON) { + VLOG_ROW << "OlapScanner input row: " << print_tuple(tuple, *_tuple_desc); + } + + // 3.4 Set tuple to RowBatch(not commited) + int row_idx = batch->add_row(); + TupleRow* row = batch->get_row(row_idx); + row->set_tuple(_tuple_idx, tuple); + + do { + // 3.5.1 Using direct conjuncts to filter data + if (_eval_conjuncts_fn != nullptr) { + if (!_eval_conjuncts_fn(&_conjunct_ctxs[0], _direct_conjunct_size, row)) { + // check direct conjuncts fail then clear 
tuple for reuse + // make sure to reset null indicators since we're overwriting + // the tuple assembled for the previous row + tuple->init(_tuple_desc->byte_size()); + break; + } + } else { + if (!ExecNode::eval_conjuncts(&_conjunct_ctxs[0], _direct_conjunct_size, row)) { + // check direct conjuncts fail then clear tuple for reuse + // make sure to reset null indicators since we're overwriting + // the tuple assembled for the previous row + tuple->init(_tuple_desc->byte_size()); + break; + } + } + + // 3.5.2 Using pushdown conjuncts to filter data + if (_use_pushdown_conjuncts) { + if (!ExecNode::eval_conjuncts( + &_conjunct_ctxs[_direct_conjunct_size], + _conjunct_ctxs.size() - _direct_conjunct_size, row)) { + // check pushdown conjuncts fail then clear tuple for reuse + // make sure to reset null indicators since we're overwriting + // the tuple assembled for the previous row + tuple->init(_tuple_desc->byte_size()); + _num_rows_pushed_cond_filtered++; + break; + } + } + + // Copy string slot + for (auto desc : _string_slots) { + StringValue* slot = tuple->get_string_slot(desc->tuple_offset()); + if (slot->len != 0) { + uint8_t* v = batch->tuple_data_pool()->allocate(slot->len); + memory_copy(v, slot->ptr, slot->len); + slot->ptr = reinterpret_cast(v); + } + } + if (VLOG_ROW_IS_ON) { + VLOG_ROW << "OlapScanner output row: " << print_tuple(tuple, *_tuple_desc); + } + + // check direct && pushdown conjuncts success then commit tuple + batch->commit_last_row(); + char* new_tuple = reinterpret_cast(tuple); + new_tuple += _tuple_desc->byte_size(); + tuple = reinterpret_cast(new_tuple); + + // compute pushdown conjuncts filter rate + if (_use_pushdown_conjuncts) { + // check this rate after + if (_num_rows_read > 32768) { + int32_t pushdown_return_rate + = _num_rows_read * 100 / (_num_rows_read + _num_rows_pushed_cond_filtered); + if (pushdown_return_rate > config::palo_max_pushdown_conjuncts_return_rate) { + _use_pushdown_conjuncts = false; + VLOG(2) << "Stop Using 
PushDown Conjuncts. " + << "PushDownReturnRate: " << pushdown_return_rate << "%" + << " MaxPushDownReturnRate: " + << config::palo_max_pushdown_conjuncts_return_rate << "%"; + } + } + } + } while (false); + + if (raw_rows_read() >= raw_rows_threshold) { + break; + } + } + } + + return Status::OK; +} + +void OlapScanner::_convert_row_to_tuple(Tuple* tuple) { + char* row = _read_row_cursor.get_buf(); + size_t slots_size = _query_slots.size(); + for (int i = 0; i < slots_size; ++i) { + SlotDescriptor* slot_desc = _query_slots[i]; + const Field* field = _query_fields[i]; + if (field->is_null(row)) { + tuple->set_null(slot_desc->null_indicator_offset()); + continue; + } + char* ptr = (char*)field->get_ptr(row); + size_t len = field->size(); + switch (slot_desc->type().type) { + case TYPE_CHAR: { + StringSlice* slice = reinterpret_cast(ptr); + StringValue *slot = tuple->get_string_slot(slot_desc->tuple_offset()); + slot->ptr = slice->data; + slot->len = strnlen(slot->ptr, slice->size); + break; + } + case TYPE_VARCHAR: + case TYPE_HLL: { + StringSlice* slice = reinterpret_cast(ptr); + StringValue *slot = tuple->get_string_slot(slot_desc->tuple_offset()); + slot->ptr = slice->data; + slot->len = slice->size; + break; + } + case TYPE_DECIMAL: { + DecimalValue *slot = tuple->get_decimal_slot(slot_desc->tuple_offset()); + + // TODO(lingbin): should remove this assign, use set member function + int64_t int_value = *(int64_t*)(ptr); + int32_t frac_value = *(int32_t*)(ptr + sizeof(int64_t)); + *slot = DecimalValue(int_value, frac_value); + break; + } + case TYPE_DATETIME: { + DateTimeValue *slot = tuple->get_datetime_slot(slot_desc->tuple_offset()); + uint64_t value = *reinterpret_cast(ptr); + if (!slot->from_olap_datetime(value)) { + tuple->set_null(slot_desc->null_indicator_offset()); + } + break; + } + case TYPE_DATE: { + DateTimeValue *slot = tuple->get_datetime_slot(slot_desc->tuple_offset()); + uint64_t value = 0; + value = *(unsigned char*)(ptr + 2); + value <<= 8; + 
value |= *(unsigned char*)(ptr + 1); + value <<= 8; + value |= *(unsigned char*)(ptr); + if (!slot->from_olap_date(value)) { + tuple->set_null(slot_desc->null_indicator_offset()); + } + break; + } + default: { + void *slot = tuple->get_slot(slot_desc->tuple_offset()); + memory_copy(slot, ptr, len); + break; + } + } + } +} + +void OlapScanner::update_counter() { + if (_has_update_counter) { + return; + } + COUNTER_UPDATE(_rows_read_counter, _num_rows_read); + COUNTER_UPDATE(_rows_pushed_cond_filtered_counter, _num_rows_pushed_cond_filtered); + + COUNTER_UPDATE(_parent->_io_timer, _reader->stats().io_ns); + COUNTER_UPDATE(_parent->_read_compressed_counter, _reader->stats().compressed_bytes_read); + COUNTER_UPDATE(_parent->_decompressor_timer, _reader->stats().decompress_ns); + COUNTER_UPDATE(_parent->_read_uncompressed_counter, _reader->stats().uncompressed_bytes_read); + COUNTER_UPDATE(_parent->bytes_read_counter(), _reader->stats().bytes_read); + + COUNTER_UPDATE(_parent->_block_load_timer, _reader->stats().block_load_ns); + COUNTER_UPDATE(_parent->_block_load_counter, _reader->stats().blocks_load); + COUNTER_UPDATE(_parent->_block_fetch_timer, _reader->stats().block_fetch_ns); + + COUNTER_UPDATE(_parent->_raw_rows_counter, _reader->stats().raw_rows_read); + // COUNTER_UPDATE(_parent->_filtered_rows_counter, _reader->stats().num_rows_filtered); + + COUNTER_UPDATE(_parent->_vec_cond_timer, _reader->stats().vec_cond_ns); + COUNTER_UPDATE(_parent->_rows_vec_cond_counter, _reader->stats().rows_vec_cond_filtered); + + COUNTER_UPDATE(_parent->_stats_filtered_counter, _reader->stats().rows_stats_filtered); + COUNTER_UPDATE(_parent->_del_filtered_counter, _reader->stats().rows_del_filtered); + + COUNTER_UPDATE(_parent->_index_load_timer, _reader->stats().index_load_ns); + + PaloMetrics::query_scan_bytes.increment(_reader->stats().compressed_bytes_read); + PaloMetrics::query_scan_rows.increment(_reader->stats().raw_rows_read); + + _has_update_counter = true; +} + Status 
OlapScanner::close(RuntimeState* state) { + if (_is_closed) { + return Status::OK; + } + update_counter(); _reader.reset(); - Expr::close(_row_conjunct_ctxs, state); - Expr::close(_vec_conjunct_ctxs, state); + Expr::close(_conjunct_ctxs, state); + _is_closed = true; return Status::OK; } diff --git a/be/src/exec/olap_scanner.h b/be/src/exec/olap_scanner.h index af554a5883..d57d9220f3 100644 --- a/be/src/exec/olap_scanner.h +++ b/be/src/exec/olap_scanner.h @@ -16,15 +16,15 @@ #ifndef BDG_PALO_BE_SRC_QUERY_EXEC_OLAP_SCANNER_H #define BDG_PALO_BE_SRC_QUERY_EXEC_OLAP_SCANNER_H -#include -#include #include #include #include +#include #include #include "common/status.h" #include "exec/olap_common.h" +#include "exec/exec_node.h" #include "exprs/expr.h" #include "gen_cpp/PaloInternalService_types.h" #include "gen_cpp/PlanNodes_types.h" @@ -32,38 +32,33 @@ #include "runtime/tuple.h" #include "runtime/vectorized_row_batch.h" +#include "olap/delete_handler.h" +#include "olap/i_data.h" +#include "olap/olap_cond.h" +#include "olap/olap_engine.h" +#include "olap/reader.h" + namespace palo { class OlapScanNode; class OLAPReader; class RuntimeProfile; +class Field; -/** - * @brief µ÷ÓÃengine_reader¶ÁÈ¡olapÊý¾Ý - * Ö§³Ö¶ÁÈ¡¶à¸öscan_range - * ²¢ÇÒ×Ô¶¯ÔÚ¸±±¾¼äÇл» - */ class OlapScanner { public: - /** - * @brief ³õʼ»¯º¯Êý. 
- * - * @param scan_range ɨÃ跶Χ - */ OlapScanner( RuntimeState* runtime_state, - const boost::shared_ptr scan_range, - const std::vector& key_ranges, - const std::vector& olap_filter, - const TupleDescriptor& tuple_desc, - RuntimeProfile* profile, - const std::vector is_null_vector); + OlapScanNode* parent, + bool aggregation, + PaloScanRange* scan_range, + const std::vector& key_ranges); - virtual ~OlapScanner(); + ~OlapScanner(); Status open(); - Status get_next(Tuple* tuple, int64_t* raw_rows_read, bool* eof); + Status get_batch(RuntimeState* state, RowBatch* batch, bool* eof); Status close(RuntimeState* state); @@ -71,46 +66,76 @@ public: return _runtime_state; } - std::vector* row_conjunct_ctxs() { - return &_row_conjunct_ctxs; + std::vector* conjunct_ctxs() { + return &_conjunct_ctxs; } - std::vector* vec_conjunct_ctxs() { - return &_vec_conjunct_ctxs; - } + int id() const { return _id; } + void set_id(int id) { _id = id; } + bool is_open() const { return _is_open; } + void set_opened() { _is_open = true; } - void set_aggregation(bool aggregation) { - _aggregation = aggregation; - } - - void set_id(int id) { - _id = id; - } - int id() { - return _id; - } - - bool is_open(); - void set_opened(); + int64_t raw_rows_read() const { return _reader->stats().raw_rows_read; } + void update_counter(); private: + Status _prepare( + PaloScanRange* scan_range, + const std::vector& key_ranges, + const std::vector& filters, + const std::vector& is_nulls); + Status _init_params( + const std::vector& key_ranges, + const std::vector& filters, + const std::vector& is_nulls); + Status _init_return_columns(); + void _convert_row_to_tuple(Tuple* tuple); + RuntimeState* _runtime_state; - const TupleDescriptor& _tuple_desc; /**< tuple descripter */ - - const boost::shared_ptr _scan_range; /**< ÇëÇóµÄ²ÎÊýÐÅÏ¢ */ - const std::vector _key_ranges; - const std::vector _olap_filter; + OlapScanNode* _parent; + const TupleDescriptor* _tuple_desc; /**< tuple descripter */ RuntimeProfile* 
_profile; + const std::vector& _string_slots; - std::vector _row_conjunct_ctxs; - std::vector _vec_conjunct_ctxs; + std::vector _conjunct_ctxs; - std::shared_ptr _reader; - - bool _aggregation; int _id; bool _is_open; - std::vector _is_null_vector; + bool _aggregation; + bool _has_update_counter = false; + + Status _ctor_status; + int _tuple_idx = 0; + int _direct_conjunct_size = 0; + + bool _use_pushdown_conjuncts = false; + + ReaderParams _params; + std::unique_ptr _reader; + + SmartOLAPTable _olap_table; + int64_t _version; + + std::vector _return_columns; + + RowCursor _read_row_cursor; + + std::vector _request_columns_size; + + std::vector _query_slots; + std::vector _query_fields; + + // time costed and row returned statistics + ExecNode::EvalConjunctsFn _eval_conjuncts_fn = nullptr; + + RuntimeProfile::Counter* _rows_read_counter = nullptr; + int64_t _num_rows_read = 0; + + RuntimeProfile::Counter* _rows_pushed_cond_filtered_counter = nullptr; + // number rows filtered by pushed condition + int64_t _num_rows_pushed_cond_filtered = 0; + + bool _is_closed = false; }; } // namespace palo diff --git a/be/src/exec/partitioned_aggregation_node.cc b/be/src/exec/partitioned_aggregation_node.cc index 1dcd509b9c..b04dabd90c 100644 --- a/be/src/exec/partitioned_aggregation_node.cc +++ b/be/src/exec/partitioned_aggregation_node.cc @@ -80,8 +80,8 @@ PartitionedAggregationNode::PartitionedAggregationNode( DCHECK_EQ(PARTITION_FANOUT, 1 << NUM_PARTITIONING_BITS); } -Status PartitionedAggregationNode::init(const TPlanNode& tnode) { - RETURN_IF_ERROR(ExecNode::init(tnode)); +Status PartitionedAggregationNode::init(const TPlanNode& tnode, RuntimeState* state) { + RETURN_IF_ERROR(ExecNode::init(tnode, state)); RETURN_IF_ERROR( Expr::create_expr_trees(_pool, tnode.agg_node.grouping_exprs, &_probe_expr_ctxs)); for (int i = 0; i < tnode.agg_node.aggregate_functions.size(); ++i) { diff --git a/be/src/exec/partitioned_aggregation_node.h b/be/src/exec/partitioned_aggregation_node.h 
index d9bd430d35..a73e3610f1 100644 --- a/be/src/exec/partitioned_aggregation_node.h +++ b/be/src/exec/partitioned_aggregation_node.h @@ -104,7 +104,7 @@ public: // a null dtor to pass codestyle check virtual ~PartitionedAggregationNode() {} - virtual Status init(const TPlanNode& tnode); + virtual Status init(const TPlanNode& tnode, RuntimeState* state = nullptr); virtual Status prepare(RuntimeState* state); virtual Status open(RuntimeState* state); virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos); @@ -470,12 +470,6 @@ private: // Assumes AGGREGATED_ROWS = false. llvm::Function* codegen_process_batch(); - // Functions to instantiate templated versions of process_batch(). - // The xcompiled versions of these functions are used in codegen_process_batch(). - // TODO: is there a better way to do this? - Status process_batch_false(RowBatch* batch, PartitionedHashTableCtx* ht_ctx); - Status process_batch_true(RowBatch* batch, PartitionedHashTableCtx* ht_ctx); - // We need two buffers per partition, one for the aggregated stream and one // for the unaggregated stream. We need an additional buffer to read the stream // we are currently repartitioning. 
diff --git a/be/src/exec/partitioned_aggregation_node_ir.cc b/be/src/exec/partitioned_aggregation_node_ir.cc index 9a8a1fb355..5847af6540 100644 --- a/be/src/exec/partitioned_aggregation_node_ir.cc +++ b/be/src/exec/partitioned_aggregation_node_ir.cc @@ -138,14 +138,9 @@ Status PartitionedAggregationNode::append_spilled_row(Partition* partition, Tupl return append_spilled_row(stream, row); } -Status PartitionedAggregationNode::process_batch_false( - RowBatch* batch, PartitionedHashTableCtx* ht_ctx) { - return process_batch(batch, ht_ctx); -} - -Status PartitionedAggregationNode::process_batch_true( - RowBatch* batch, PartitionedHashTableCtx* ht_ctx) { - return process_batch(batch, ht_ctx); -} +template Status PartitionedAggregationNode::process_batch( + RowBatch*, PartitionedHashTableCtx*); +template Status PartitionedAggregationNode::process_batch( + RowBatch*, PartitionedHashTableCtx*); } // end namespace palo diff --git a/be/src/exec/partitioned_hash_table.cc b/be/src/exec/partitioned_hash_table.cc index bfd2335669..81ba76e332 100644 --- a/be/src/exec/partitioned_hash_table.cc +++ b/be/src/exec/partitioned_hash_table.cc @@ -281,9 +281,11 @@ void PartitionedHashTable::close() { for (int i = 0; i < _data_pages.size(); ++i) { _data_pages[i]->del(); } +#if 0 if (PaloMetrics::hash_table_total_bytes() != NULL) { PaloMetrics::hash_table_total_bytes()->increment(-_total_data_page_size); } +#endif _data_pages.clear(); if (_buckets != NULL) { free(_buckets); @@ -372,9 +374,11 @@ bool PartitionedHashTable::grow_node_array() { } _data_pages.push_back(block); _next_node = block->allocate(page_size); +#if 0 if (PaloMetrics::hash_table_total_bytes() != NULL) { PaloMetrics::hash_table_total_bytes()->increment(page_size); } +#endif _node_remaining_current_page = page_size / sizeof(DuplicateNode); _total_data_page_size += page_size; return true; diff --git a/be/src/exec/pl_task_root.cpp b/be/src/exec/pl_task_root.cpp new file mode 100644 index 0000000000..714753552a --- /dev/null 
+++ b/be/src/exec/pl_task_root.cpp @@ -0,0 +1,152 @@ +// Modifications copyright (C) 2017, Baidu.com, Inc. +// Copyright 2017 The Apache Software Foundation + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "exec/pl_task_root.h" + +namespace palo { + +ExchangeNode::ExchangeNode( + ObjectPool* pool, + const TPlanNode& tnode, + const DescriptorTbl& descs) : + ExecNode(pool, tnode, descs), + _num_senders(0), + _stream_recvr(NULL), + _next_row_idx(0) { +} + +ExchangeNode::~ExchangeNode() { +} + +Status ExchangeNode::init(const TPlanNode& tnode, RuntimeState* state) { + return ExecNode::init(tnode, state); +} + +Status ExchangeNode::prepare(RuntimeState* state) { + RETURN_IF_ERROR(ExecNode::prepare(state)); + + _convert_row_batch_timer = ADD_TIMER(runtime_profile(), "ConvertRowBatchTime"); + + // TODO: figure out appropriate buffer size + DCHECK_GT(_num_senders, 0); + _stream_recvr = state->create_recvr(_row_descriptor, _id, _num_senders, + config::exchg_node_buffer_size_bytes, runtime_profile()); + return Status::OK; +} + +Status ExchangeNode::open(RuntimeState* state) { + SCOPED_TIMER(_runtime_profile->total_time_counter()); + RETURN_IF_ERROR(ExecNode::open(state)); + return Status::OK; +} + +Status 
ExchangeNode::close(RuntimeState* state) { + if (is_closed()) { + return Status::OK; + } + return ExecNode::close(state); +} + +Status ExchangeNode::get_next(RuntimeState* state, RowBatch* output_batch, bool* eos) { + RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); + SCOPED_TIMER(_runtime_profile->total_time_counter()); + + if (reached_limit()) { + *eos = true; + return Status::OK; + } + + ExprContext* const* ctxs = &_conjunct_ctxs[0]; + int num_ctxs = _conjunct_ctxs.size(); + + while (true) { + { + SCOPED_TIMER(_convert_row_batch_timer); + + // copy rows until we hit the limit/capacity or until we exhaust _input_batch + while (!reached_limit() && !output_batch->is_full() + && _input_batch.get() != NULL && _next_row_idx < _input_batch->capacity()) { + TupleRow* src = _input_batch->get_row(_next_row_idx); + + if (ExecNode::eval_conjuncts(ctxs, num_ctxs, src)) { + int j = output_batch->add_row(); + TupleRow* dest = output_batch->get_row(j); + // if the input row is shorter than the output row, make sure not to leave + // uninitialized Tuple* around + output_batch->clear_row(dest); + // this works as expected if rows from input_batch form a prefix of + // rows in output_batch + _input_batch->copy_row(src, dest); + output_batch->commit_last_row(); + ++_num_rows_returned; + } + + ++_next_row_idx; + } + + COUNTER_SET(_rows_returned_counter, _num_rows_returned); + + if (reached_limit()) { + *eos = true; + return Status::OK; + } + + if (output_batch->is_full()) { + *eos = false; + return Status::OK; + } + } + + // we need more rows + if (_input_batch.get() != NULL) { + _input_batch->transfer_resource_ownership(output_batch); + } + + bool is_cancelled = true; + _input_batch.reset(_stream_recvr->get_batch(&is_cancelled)); + VLOG_FILE << "exch: has batch=" << (_input_batch.get() == NULL ? "false" : "true") + << " #rows=" << (_input_batch.get() != NULL ? _input_batch->num_rows() : 0) + << " is_cancelled=" << (is_cancelled ? 
"true" : "false") + << " instance_id=" << state->fragment_instance_id(); + + if (is_cancelled) { + return Status::CANCELLED; + } + + *eos = (_input_batch.get() == NULL); + + if (*eos) { + return Status::OK; + } + + _next_row_idx = 0; + DCHECK(_input_batch->row_desc().is_prefix_of(output_batch->row_desc())); + } +} + +void ExchangeNode::debug_string(int indentation_level, std::stringstream* out) const { + *out << string(indentation_level * 2, ' '); + *out << "ExchangeNode(#senders=" << _num_senders; + ExecNode::debug_string(indentation_level, out); + *out << ")"; +} + +} + diff --git a/be/src/exec/pl_task_root.h b/be/src/exec/pl_task_root.h new file mode 100644 index 0000000000..3d3ae6869d --- /dev/null +++ b/be/src/exec/pl_task_root.h @@ -0,0 +1,53 @@ +// Modifications copyright (C) 2017, Baidu.com, Inc. +// Copyright 2017 The Apache Software Foundation. + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "exec/exec_node.h" + +namespace palo { + +// Pull load task root +class PlTaskRoot : public ExecNode { +public: + PlTaskRoot(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs); + virtual ~PlTaskRoot(); + + virtual Status init(const TPlanNode& tnode, RuntimeState* state = nullptr); + virtual Status prepare(RuntimeState* state); + virtual Status open(RuntimeState* state); + virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos); + virtual Status close(RuntimeState* state); + + // the number of senders needs to be set after the c'tor, because it's not + // recorded in TPlanNode, and before calling prepare() + void set_num_senders(int num_senders) { + _num_senders = num_senders; + } + +protected: + virtual void debug_string(int indentation_level, std::stringstream* out) const; + +private: + int _num_senders; // needed for _stream_recvr construction +}; + +} + diff --git a/be/src/exec/schema_scan_node.cpp b/be/src/exec/schema_scan_node.cpp index aebddf2404..87076b4292 100644 --- a/be/src/exec/schema_scan_node.cpp +++ b/be/src/exec/schema_scan_node.cpp @@ -55,7 +55,7 @@ SchemaScanNode::~SchemaScanNode() { _src_tuple = NULL; } -Status SchemaScanNode::init(const TPlanNode& tnode) { +Status SchemaScanNode::init(const TPlanNode& tnode, RuntimeState* state) { RETURN_IF_ERROR(ExecNode::init(tnode)); if (tnode.schema_scan_node.__isset.db) { _scanner_param.db = _pool->add(new std::string(tnode.schema_scan_node.db)); diff --git a/be/src/exec/schema_scan_node.h b/be/src/exec/schema_scan_node.h index bbc1aef14a..88b08684f0 100644 --- a/be/src/exec/schema_scan_node.h +++ b/be/src/exec/schema_scan_node.h @@ -44,27 +44,27 @@ public: // Prepare conjuncts, create Schema columns to slots mapping // initialize _schema_scanner - virtual Status init(const TPlanNode& tnode); + Status init(const TPlanNode& tnode, RuntimeState* state = nullptr) override; // Prepare conjuncts, create Schema columns to slots mapping // 
initialize _schema_scanner - virtual Status prepare(RuntimeState* state); + Status prepare(RuntimeState* state) override; // Start Schema scan using _schema_scanner. - virtual Status open(RuntimeState* state); + Status open(RuntimeState* state) override; // Fill the next row batch by calling next() on the _schema_scanner, - virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos); + Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override; // Close the _schema_scanner, and report errors. - virtual Status close(RuntimeState* state); + Status close(RuntimeState* state) override; // this is no use in this class - virtual Status set_scan_ranges(const std::vector& scan_ranges); + Status set_scan_ranges(const std::vector& scan_ranges) override; private: // Write debug string of this into out. - virtual void debug_string(int indentation_level, std::stringstream* out) const; + void debug_string(int indentation_level, std::stringstream* out) const override; // Copy one row from schema table to input tuple void copy_one_row(); diff --git a/be/src/exec/select_node.cpp b/be/src/exec/select_node.cpp index da203fb4af..13d91ceca2 100644 --- a/be/src/exec/select_node.cpp +++ b/be/src/exec/select_node.cpp @@ -58,6 +58,7 @@ Status SelectNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) if (reached_limit() || (_child_row_idx == _child_row_batch->num_rows() && _child_eos)) { // we're already done or we exhausted the last child batch and there won't be any // new ones + _child_row_batch->transfer_resource_ownership(row_batch); *eos = true; return Status::OK; } @@ -65,23 +66,30 @@ Status SelectNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) // start (or continue) consuming row batches from child while (true) { + RETURN_IF_CANCELLED(state); if (_child_row_idx == _child_row_batch->num_rows()) { // fetch next batch - RETURN_IF_CANCELLED(state); - 
row_batch->tuple_data_pool()->acquire_data(_child_row_batch->tuple_data_pool(), false); - _child_row_batch->reset(); - RETURN_IF_ERROR(child(0)->get_next(state, _child_row_batch.get(), &_child_eos)); _child_row_idx = 0; + _child_row_batch->transfer_resource_ownership(row_batch); + _child_row_batch->reset(); + if (row_batch->at_capacity()) { + return Status::OK; + } + RETURN_IF_ERROR(child(0)->get_next(state, _child_row_batch.get(), &_child_eos)); } if (copy_rows(row_batch)) { *eos = reached_limit() || (_child_row_idx == _child_row_batch->num_rows() && _child_eos); + if (*eos) { + _child_row_batch->transfer_resource_ownership(row_batch); + } return Status::OK; } if (_child_eos) { // finished w/ last child row batch, and child eos is true + _child_row_batch->transfer_resource_ownership(row_batch); *eos = true; return Status::OK; } diff --git a/be/src/exec/sort_node.cpp b/be/src/exec/sort_node.cpp index 02f49f1e3c..cc70dd3a13 100644 --- a/be/src/exec/sort_node.cpp +++ b/be/src/exec/sort_node.cpp @@ -30,14 +30,14 @@ SortNode::SortNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl : ExecNode(pool, tnode, descs), _offset(tnode.sort_node.__isset.offset ? tnode.sort_node.offset : 0), _num_rows_skipped(0) { - Status status = init(tnode); + Status status = init(tnode, nullptr); DCHECK(status.ok()) << "SortNode c'tor:init failed: \n" << status.get_error_msg(); } SortNode::~SortNode() { } -Status SortNode::init(const TPlanNode& tnode) { +Status SortNode::init(const TPlanNode& tnode, RuntimeState* state) { const vector* sort_tuple_slot_exprs = tnode.sort_node.__isset.sort_tuple_slot_exprs ? 
&tnode.sort_node.sort_tuple_slot_exprs : NULL; RETURN_IF_ERROR(_sort_exec_exprs.init(tnode.sort_node.ordering_exprs, diff --git a/be/src/exec/sort_node.h b/be/src/exec/sort_node.h index 0396200ecf..58c287e49a 100644 --- a/be/src/exec/sort_node.h +++ b/be/src/exec/sort_node.h @@ -51,7 +51,7 @@ protected: virtual void debug_string(int indentation_level, std::stringstream* out) const; private: - Status init(const TPlanNode& tnode); + Status init(const TPlanNode& tnode, RuntimeState* state = nullptr); // Fetch input rows and feed them to the sorter until the input is exhausted. Status sort_input(RuntimeState* state); diff --git a/be/src/exec/spill_sort_node.cc b/be/src/exec/spill_sort_node.cc index 5f5ad9a065..2f50560d28 100644 --- a/be/src/exec/spill_sort_node.cc +++ b/be/src/exec/spill_sort_node.cc @@ -37,8 +37,8 @@ SpillSortNode::SpillSortNode(ObjectPool* pool, const TPlanNode& tnode, const Des SpillSortNode::~SpillSortNode() { } -Status SpillSortNode::init(const TPlanNode& tnode) { - RETURN_IF_ERROR(ExecNode::init(tnode)); +Status SpillSortNode::init(const TPlanNode& tnode, RuntimeState* state) { + RETURN_IF_ERROR(ExecNode::init(tnode, state)); RETURN_IF_ERROR(_sort_exec_exprs.init(tnode.sort_node.sort_info, _pool)); _is_asc_order = tnode.sort_node.sort_info.is_asc_order; _nulls_first = tnode.sort_node.sort_info.nulls_first; diff --git a/be/src/exec/spill_sort_node.h b/be/src/exec/spill_sort_node.h index b8078cd37b..70493f4766 100644 --- a/be/src/exec/spill_sort_node.h +++ b/be/src/exec/spill_sort_node.h @@ -42,7 +42,7 @@ public: SpillSortNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs); ~SpillSortNode(); - virtual Status init(const TPlanNode& tnode); + virtual Status init(const TPlanNode& tnode, RuntimeState* state = nullptr); virtual Status prepare(RuntimeState* state); virtual Status open(RuntimeState* state); virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos); diff --git a/be/src/exec/text_converter.hpp 
b/be/src/exec/text_converter.hpp index a770ed812b..2db2c86f95 100644 --- a/be/src/exec/text_converter.hpp +++ b/be/src/exec/text_converter.hpp @@ -108,10 +108,11 @@ inline bool TextConverter::write_slot(const SlotDescriptor* slot_desc, StringParser::string_to_int(data, len, &parse_result); break; - case TYPE_LARGEINT: - *reinterpret_cast<__int128*>(slot) = - StringParser::string_to_int<__int128>(data, len, &parse_result); + case TYPE_LARGEINT: { + __int128 tmp = StringParser::string_to_int<__int128>(data, len, &parse_result); + memcpy(slot, &tmp, sizeof(tmp)); break; + } case TYPE_FLOAT: *reinterpret_cast(slot) = diff --git a/be/src/exec/topn_node.cpp b/be/src/exec/topn_node.cpp index ebe65400e3..f6b6f580c1 100644 --- a/be/src/exec/topn_node.cpp +++ b/be/src/exec/topn_node.cpp @@ -52,8 +52,8 @@ TopNNode::TopNNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl TopNNode::~TopNNode() { } -Status TopNNode::init(const TPlanNode& tnode) { - RETURN_IF_ERROR(ExecNode::init(tnode)); +Status TopNNode::init(const TPlanNode& tnode, RuntimeState* state) { + RETURN_IF_ERROR(ExecNode::init(tnode, state)); RETURN_IF_ERROR(_sort_exec_exprs.init(tnode.sort_node.sort_info, _pool)); _is_asc_order = tnode.sort_node.sort_info.is_asc_order; _nulls_first = tnode.sort_node.sort_info.nulls_first; @@ -112,7 +112,7 @@ Status TopNNode::open(RuntimeState* state) { // Limit of 0, no need to fetch anything from children. 
if (_limit != 0) { RowBatch batch(child(0)->row_desc(), state->batch_size(), mem_tracker()); - bool eos = true; + bool eos = false; do { batch.reset(); diff --git a/be/src/exec/topn_node.h b/be/src/exec/topn_node.h index 5dba2e12b1..80134ce6ff 100644 --- a/be/src/exec/topn_node.h +++ b/be/src/exec/topn_node.h @@ -43,7 +43,7 @@ public: TopNNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs); virtual ~TopNNode(); - virtual Status init(const TPlanNode& tnode); + virtual Status init(const TPlanNode& tnode, RuntimeState* state = nullptr); virtual Status prepare(RuntimeState* state); virtual Status open(RuntimeState* state); diff --git a/be/src/exec/union_node.cpp b/be/src/exec/union_node.cpp index 1d9df38587..155fc878df 100644 --- a/be/src/exec/union_node.cpp +++ b/be/src/exec/union_node.cpp @@ -51,10 +51,10 @@ UnionNode::UnionNode(ObjectPool* pool, const TPlanNode& tnode, _to_close_child_idx(-1) { } -Status UnionNode::init(const TPlanNode& tnode) { +Status UnionNode::init(const TPlanNode& tnode, RuntimeState* state) { // TODO(zc): // RETURN_IF_ERROR(ExecNode::init(tnode, state)); - RETURN_IF_ERROR(ExecNode::init(tnode)); + RETURN_IF_ERROR(ExecNode::init(tnode, state)); DCHECK(tnode.__isset.union_node); DCHECK_EQ(_conjunct_ctxs.size(), 0); // Create const_expr_ctx_lists_ from thrift exprs. 
diff --git a/be/src/exec/union_node.h b/be/src/exec/union_node.h index 650a52bec2..bff6be065e 100644 --- a/be/src/exec/union_node.h +++ b/be/src/exec/union_node.h @@ -46,7 +46,7 @@ class UnionNode : public ExecNode { public: UnionNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs); - virtual Status init(const TPlanNode& tnode); + virtual Status init(const TPlanNode& tnode, RuntimeState* state = nullptr); virtual Status prepare(RuntimeState* state); virtual void codegen(RuntimeState* state); virtual Status open(RuntimeState* state); diff --git a/be/src/exprs/CMakeLists.txt b/be/src/exprs/CMakeLists.txt index 1030c02c2a..2670e332d5 100644 --- a/be/src/exprs/CMakeLists.txt +++ b/be/src/exprs/CMakeLists.txt @@ -62,6 +62,8 @@ add_library(Exprs json_functions.cpp operators.cpp hll_hash_function.cpp + agg_fn.cc + new_agg_fn_evaluator.cc ) #ADD_BE_TEST(json_function_test) #ADD_BE_TEST(binary_predicate_test) diff --git a/be/src/exprs/agg_fn.cc b/be/src/exprs/agg_fn.cc new file mode 100644 index 0000000000..b8187e8dfa --- /dev/null +++ b/be/src/exprs/agg_fn.cc @@ -0,0 +1,216 @@ +// Modifications copyright (C) 2017, Baidu.com, Inc. +// Copyright 2017 The Apache Software Foundation + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "exprs/agg_fn.h" + +#include "codegen/llvm_codegen.h" +#include "exprs/anyval_util.h" +#include "runtime/descriptors.h" +#include "runtime/lib_cache.h" +#include "runtime/runtime_state.h" + +#include "common/names.h" + +using namespace palo_udf; +using namespace llvm; + +namespace palo { + +AggFn::AggFn(const TExprNode& tnode, const SlotDescriptor& intermediate_slot_desc, + const SlotDescriptor& output_slot_desc) + : Expr(tnode), + is_merge_(tnode.agg_expr.is_merge_agg), + intermediate_slot_desc_(intermediate_slot_desc), + output_slot_desc_(output_slot_desc) { + // TODO(pengyubing) arg_type_descs_ is used for codegen + // arg_type_descs_(AnyValUtil::column_type_to_type_desc( + // TypeDescriptor::from_thrift(tnode.agg_expr.arg_types))) { + DCHECK(tnode.__isset.fn); + DCHECK(tnode.fn.__isset.aggregate_fn); + // TODO chenhao + DCHECK_EQ(tnode.node_type, TExprNodeType::AGG_EXPR); + DCHECK_EQ(TypeDescriptor::from_thrift(tnode.type).type, + TypeDescriptor::from_thrift(_fn.ret_type).type); + const string& fn_name = _fn.name.function_name; + if (fn_name == "count") { + agg_op_ = COUNT; + } else if (fn_name == "min") { + agg_op_ = MIN; + } else if (fn_name == "max") { + agg_op_ = MAX; + } else if (fn_name == "sum" || fn_name == "sum_init_zero") { + agg_op_ = SUM; + } else if (fn_name == "avg") { + agg_op_ = AVG; + } else if (fn_name == "ndv" || fn_name == "ndv_no_finalize") { + agg_op_ = NDV; + } else if (fn_name == "multi_distinct_count") { + agg_op_ = COUNT_DISTINCT; + } else if (fn_name == "multi_distinct_sum") { + agg_op_ = SUM_DISTINCT; + } else { + agg_op_ = OTHER; + } +} + +Status AggFn::Init(const RowDescriptor& row_desc, RuntimeState* state) { + // TODO chenhao , calling expr's prepare in NewAggFnEvaluator create + // Initialize all children (i.e. input exprs to this aggregate expr). 
+ //for (Expr* input_expr : children()) { + // RETURN_IF_ERROR(input_expr->prepare(row_desc, state)); + //} + + // Initialize the aggregate expressions' internals. + const TAggregateFunction& aggregate_fn = _fn.aggregate_fn; + DCHECK_EQ(intermediate_slot_desc_.type().type, + TypeDescriptor::from_thrift(aggregate_fn.intermediate_type).type); + DCHECK_EQ(output_slot_desc_.type().type, TypeDescriptor::from_thrift(_fn.ret_type).type); + + // Load the function pointers. Must have init() and update(). + if (aggregate_fn.init_fn_symbol.empty() || + aggregate_fn.update_fn_symbol.empty() || + (aggregate_fn.merge_fn_symbol.empty() && !aggregate_fn.is_analytic_only_fn)) { + // This path is only for partially implemented builtins. + DCHECK_EQ(_fn.binary_type, TFunctionBinaryType::BUILTIN); + stringstream ss; + ss << "Function " << _fn.name.function_name << " is not implemented."; + return Status(ss.str()); + } + + RETURN_IF_ERROR(LibCache::instance()->get_so_function_ptr(_fn.hdfs_location, + aggregate_fn.init_fn_symbol, &init_fn_, &_cache_entry)); + RETURN_IF_ERROR(LibCache::instance()->get_so_function_ptr(_fn.hdfs_location, + aggregate_fn.update_fn_symbol, &update_fn_, &_cache_entry)); + + // Merge() is not defined for purely analytic function. 
+ if (!aggregate_fn.is_analytic_only_fn) { + RETURN_IF_ERROR(LibCache::instance()->get_so_function_ptr(_fn.hdfs_location, + aggregate_fn.merge_fn_symbol, &merge_fn_, &_cache_entry)); + } + // Serialize(), GetValue(), Remove() and Finalize() are optional + if (!aggregate_fn.serialize_fn_symbol.empty()) { + RETURN_IF_ERROR(LibCache::instance()->get_so_function_ptr(_fn.hdfs_location, + aggregate_fn.serialize_fn_symbol, &serialize_fn_, &_cache_entry)); + } + if (!aggregate_fn.get_value_fn_symbol.empty()) { + RETURN_IF_ERROR(LibCache::instance()->get_so_function_ptr(_fn.hdfs_location, + aggregate_fn.get_value_fn_symbol, &get_value_fn_, &_cache_entry)); + } + if (!aggregate_fn.remove_fn_symbol.empty()) { + RETURN_IF_ERROR(LibCache::instance()->get_so_function_ptr(_fn.hdfs_location, + aggregate_fn.remove_fn_symbol, &remove_fn_, &_cache_entry)); + } + if (!aggregate_fn.finalize_fn_symbol.empty()) { + RETURN_IF_ERROR(LibCache::instance()->get_so_function_ptr(_fn.hdfs_location, + _fn.aggregate_fn.finalize_fn_symbol, &finalize_fn_, &_cache_entry)); + } + return Status::OK; +} + +Status AggFn::Create(const TExpr& texpr, const RowDescriptor& row_desc, + const SlotDescriptor& intermediate_slot_desc, const SlotDescriptor& output_slot_desc, + RuntimeState* state, AggFn** agg_fn) { + *agg_fn = nullptr; + ObjectPool* pool = state->obj_pool(); + const TExprNode& texpr_node = texpr.nodes[0]; + //TODO chenhao + DCHECK_EQ(texpr_node.node_type, TExprNodeType::AGG_EXPR); + if (!texpr_node.__isset.fn) { + return Status("Function not set in thrift AGGREGATE_EXPR node"); + } + AggFn* new_agg_fn = + pool->add(new AggFn(texpr_node, intermediate_slot_desc, output_slot_desc)); + RETURN_IF_ERROR(Expr::create_tree(texpr, pool, new_agg_fn)); + Status status = new_agg_fn->Init(row_desc, state); + if (UNLIKELY(!status.ok())) { + new_agg_fn->Close(); + return status; + } + for (Expr* input_expr : new_agg_fn->children()) { + int fn_ctx_idx = 0; + input_expr->assign_fn_ctx_idx(&fn_ctx_idx); + } + 
*agg_fn = new_agg_fn; + return Status::OK; +} + +FunctionContext::TypeDesc AggFn::GetIntermediateTypeDesc() const { + return AnyValUtil::column_type_to_type_desc(intermediate_slot_desc_.type()); +} + +FunctionContext::TypeDesc AggFn::GetOutputTypeDesc() const { + return AnyValUtil::column_type_to_type_desc(output_slot_desc_.type()); +} + +//Status AggFn::CodegenUpdateOrMergeFunction(LlvmCodeGen* codegen, Function** uda_fn) { +// const string& symbol = +// is_merge_ ? fn_.aggregate_fn.merge_fn_symbol : _fn.aggregate_fn.update_fn_symbol; +// std::vector fn_arg_types; +// for (Expr* input_expr : children()) { +// fn_arg_types.push_back(input_expr->type()); +// } +// // The intermediate value is passed as the last argument. +// fn_arg_types.push_back(intermediate_type()); +// RETURN_IF_ERROR(codegen->LoadFunction(_fn, symbol, nullptr, fn_arg_types, +// fn_arg_types.size(), false, uda_fn, &_cache_entry)); +// +// // Inline constants into the function body (if there is an IR body). +// if (!(*uda_fn)->isDeclaration()) { +// // TODO: IMPALA-4785: we should also replace references to GetIntermediateType() +// // with constants. +// codegen->InlineConstFnAttrs(GetOutputTypeDesc(), arg_type_descs_, *uda_fn); +// *uda_fn = codegen->FinalizeFunction(*uda_fn); +// if (*uda_fn == nullptr) { +// return Status(TErrorCode::UDF_VERIFY_FAILED, symbol, fn_.hdfs_location); +// } +// } +// return Status::OK; +//} + +void AggFn::Close() { + // This also closes all the input expressions. + Expr::close(); +} + +void AggFn::Close(const vector& exprs) { + for (AggFn* expr : exprs) expr->Close(); +} + +string AggFn::DebugString() const { + stringstream out; + out << "AggFn(op=" << agg_op_; + for (Expr* input_expr : children()) { + out << " " << input_expr->debug_string() << ")"; + } + out << ")"; + return out.str(); +} + +string AggFn::DebugString(const vector& agg_fns) { + stringstream out; + out << "["; + for (int i = 0; i < agg_fns.size(); ++i) { + out << (i == 0 ? 
"" : " ") << agg_fns[i]->DebugString(); + } + out << "]"; + return out.str(); +} + +} diff --git a/be/src/exprs/agg_fn.h b/be/src/exprs/agg_fn.h new file mode 100644 index 0000000000..9772a0b580 --- /dev/null +++ b/be/src/exprs/agg_fn.h @@ -0,0 +1,195 @@ +// Modifications copyright (C) 2017, Baidu.com, Inc. +// Copyright 2017 The Apache Software Foundation + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef BDG_PALO_BE_SRC_QUERY_NEW_EXPRS_AGG_FN_H +#define BDG_PALO_BE_SRC_QUERY_NEW_EXPRS_AGG_FN_H + +#include "exprs/expr.h" +#include "runtime/descriptors.h" +#include "udf/udf.h" + +namespace palo { + +using palo_udf::FunctionContext; + +class LlvmCodeGen; +class MemPool; +class MemTracker; +class ObjectPool; +class RuntimeState; +class Tuple; +class TupleRow; +class TExprNode; + +/// --- AggFn overview +/// +/// An aggregate function generates an output over a set of tuple rows. +/// An example would be AVG() which computes the average of all input rows. +/// The built-in aggregate functions such as min, max, sum, avg, ndv etc are +/// in this category. 
+/// +/// --- Implementation +/// +/// AggFn contains the aggregation operations, pointers to the UDAF interface functions +/// implementing various states of aggregation and the descriptors for the intermediate +/// and output values. Please see udf/udf.h for details of the UDAF interfaces. +/// +/// AggFnEvaluator is the interface for evaluating aggregate functions against input +/// tuple rows. It invokes the following functions at different phases of the aggregation: +/// +/// init_fn_ : An initialization function that initializes the aggregate value. +/// +/// update_fn_ : An update function that processes the arguments for each row in the +/// query result set and accumulates an intermediate result. For example, +/// this function might increment a counter, append to a string buffer or +/// add the input to a culmulative sum. +/// +/// merge_fn_ : A merge function that combines multiple intermediate results into a +/// single value. +/// +/// serialize_fn_: A serialization function that flattens any intermediate values +/// containing pointers, and frees any memory allocated during the init, +/// update and merge phases. +/// +/// finalize_fn_ : A finalize function that either passes through the combined result +/// unchanged, or does one final transformation. Also frees the resources +/// allocated during init, update and merge phases. +/// +/// get_value_fn_: Used by AnalyticEval node to obtain the current intermediate value. +/// +/// remove_fn_ : Used by AnalyticEval node to undo the update to the intermediate value +/// by an input row as it falls out of a sliding window. +/// +class AggFn : public Expr { + public: + + /// Override the base class' implementation. + virtual bool IsAggFn() const { return true; } + + /// Enum for some built-in aggregation ops. 
+ enum AggregationOp { + COUNT, + MIN, + MAX, + SUM, + AVG, + NDV, + SUM_DISTINCT, + COUNT_DISTINCT, + HLL_UNION_AGG, + OTHER, + }; + + /// Creates and initializes an aggregate function from 'texpr' and returns it in + /// 'agg_fn'. The returned AggFn lives in the ObjectPool of 'state'. 'row_desc' is + /// the row descriptor of the input tuple row; 'intermediate_slot_desc' is the slot + /// descriptor of the intermediate value; 'output_slot_desc' is the slot descriptor + /// of the output value. On failure, returns error status and sets 'agg_fn' to NULL. + static Status Create(const TExpr& texpr, const RowDescriptor& row_desc, + const SlotDescriptor& intermediate_slot_desc, + const SlotDescriptor& output_slot_desc, RuntimeState* state, AggFn** agg_fn) + WARN_UNUSED_RESULT; + + bool is_merge() const { return is_merge_; } + AggregationOp agg_op() const { return agg_op_; } + bool is_count_star() const { return agg_op_ == COUNT && _children.empty(); } + bool is_count_distinct() const { return agg_op_ == COUNT_DISTINCT; } + bool is_sum_distinct() const { return agg_op_ == SUM_DISTINCT; } + bool is_builtin() const { return _fn.binary_type == TFunctionBinaryType::BUILTIN; } + const std::string& fn_name() const { return _fn.name.function_name; } + const TypeDescriptor& intermediate_type() const { return intermediate_slot_desc_.type(); } + const SlotDescriptor& intermediate_slot_desc() const { return intermediate_slot_desc_; } + // Output type is the same as Expr::type(). + const SlotDescriptor& output_slot_desc() const { return output_slot_desc_; } + void* remove_fn() const { return remove_fn_; } + void* merge_or_update_fn() const { return is_merge_ ? 
merge_fn_ : update_fn_; } + void* serialize_fn() const { return serialize_fn_; } + void* get_value_fn() const { return get_value_fn_; } + void* finalize_fn() const { return finalize_fn_; } + bool SupportsRemove() const { return remove_fn_ != nullptr; } + bool SupportsSerialize() const { return serialize_fn_ != nullptr; } + FunctionContext::TypeDesc GetIntermediateTypeDesc() const; + FunctionContext::TypeDesc GetOutputTypeDesc() const; + const std::vector& arg_type_descs() const { + return arg_type_descs_; + } + + /// Generates an IR wrapper function to call update_fn_/merge_fn_ which may either be + /// cross-compiled or loaded from an external library. The generated IR function is + /// returned in 'uda_fn'. Returns error status on failure. + /// TODO: implement codegen path for init, finalize, serialize functions etc. + Status CodegenUpdateOrMergeFunction(LlvmCodeGen* codegen, llvm::Function** uda_fn) + WARN_UNUSED_RESULT; + + Status get_codegend_compute_fn(RuntimeState* state, llvm::Function** fn) { + return Status::OK; + } + + /// Releases all cache entries to libCache for all nodes in the expr tree. + virtual void Close(); + static void Close(const std::vector& exprs); + + Expr* clone(ObjectPool* pool) const { + return nullptr; + } + + virtual std::string DebugString() const; + static std::string DebugString(const std::vector& exprs); + +private: + friend class Expr; + friend class NewAggFnEvaluator; + + /// True if this is a merging aggregation. + const bool is_merge_; + + /// Slot into which Update()/Merge()/Serialize() write their result. Not owned. + const SlotDescriptor& intermediate_slot_desc_; + + /// Slot into which Finalize() results are written. Not owned. Identical to + /// intermediate_slot_desc_ if this agg fn has the same intermediate and result type. + const SlotDescriptor& output_slot_desc_; + + /// The types of the arguments to the aggregate function. + const std::vector arg_type_descs_; + + /// The aggregation operation. 
+ AggregationOp agg_op_; + + /// Function pointers for the different phases of the aggregate function. + void* init_fn_ = nullptr; + void* update_fn_ = nullptr; + void* remove_fn_ = nullptr; + void* merge_fn_ = nullptr; + void* serialize_fn_ = nullptr; + void* get_value_fn_ = nullptr; + void* finalize_fn_ = nullptr; + + AggFn(const TExprNode& node, const SlotDescriptor& intermediate_slot_desc, + const SlotDescriptor& output_slot_desc); + + /// Initializes the AggFn and its input expressions. May load the UDAF from LibCache + /// if necessary. + virtual Status Init(const RowDescriptor& desc, RuntimeState* state) WARN_UNUSED_RESULT; +}; + +} + +#endif diff --git a/be/src/exprs/agg_fn_evaluator.cpp b/be/src/exprs/agg_fn_evaluator.cpp index 5023d6b277..92498028e2 100755 --- a/be/src/exprs/agg_fn_evaluator.cpp +++ b/be/src/exprs/agg_fn_evaluator.cpp @@ -31,6 +31,7 @@ #include "udf/udf_internal.h" #include "util/debug_util.h" #include "runtime/datetime_value.h" +#include "runtime/mem_tracker.h" #include "thrift/protocol/TDebugProtocol.h" #include "runtime/raw_value.h" @@ -340,7 +341,7 @@ inline void AggFnEvaluator::set_any_val( return; case TYPE_LARGEINT: - reinterpret_cast(dst)->val = *reinterpret_cast(slot); + memcpy(&reinterpret_cast(dst)->val, slot, sizeof(__int128)); return; default: @@ -409,7 +410,7 @@ inline void AggFnEvaluator::set_output_slot(const AnyVal* src, return; case TYPE_LARGEINT: { - *reinterpret_cast<__int128*>(slot) = reinterpret_cast(src)->val; + memcpy(slot, &reinterpret_cast(src)->val, sizeof(__int128)); return; } diff --git a/be/src/exprs/agg_fn_evaluator.h b/be/src/exprs/agg_fn_evaluator.h index 6d44894dac..cfc4e00f17 100755 --- a/be/src/exprs/agg_fn_evaluator.h +++ b/be/src/exprs/agg_fn_evaluator.h @@ -95,6 +95,10 @@ public: void close(RuntimeState* state); + const TypeDescriptor& intermediate_type() const { + return _intermediate_slot_desc->type(); + } + //PrimitiveType type() const { return _type.type; } AggregationOp agg_op() const { return 
_agg_op; @@ -194,6 +198,11 @@ public: const std::string& fn_name() const { return _fn.name.function_name; } + + const SlotDescriptor* output_slot_desc() const { + return _output_slot_desc; + } + private: const TFunction _fn; diff --git a/be/src/exprs/aggregate_functions.cpp b/be/src/exprs/aggregate_functions.cpp index 6f3f25e4c3..9a4dd6017f 100644 --- a/be/src/exprs/aggregate_functions.cpp +++ b/be/src/exprs/aggregate_functions.cpp @@ -22,11 +22,13 @@ #include #include +#include #include "common/logging.h" #include "runtime/string_value.h" #include "runtime/datetime_value.h" #include "exprs/anyval_util.h" +#include "exprs/hybird_set.h" #include "util/debug_util.h" // TODO: this file should be cross compiled and then all of the builtin @@ -1073,6 +1075,647 @@ int64_t AggregateFunctions::hll_algorithm(const palo_udf::StringVal& src) { return (int64_t)(estimate + 0.5); } +// TODO chenhao , reduce memory copy +// multi distinct state for numertic +// serialize order type:value:value:value ... 
+template +class MultiDistinctNumericState { +public: + + static void create(StringVal* dst) { + dst->is_null = false; + const int state_size = sizeof(MultiDistinctNumericState); + MultiDistinctNumericState* state = new MultiDistinctNumericState(); + if (std::is_same::value) { + state->_type = FunctionContext::TYPE_TINYINT; + } else if (std::is_same::value) { + state->_type = FunctionContext::TYPE_SMALLINT; + } else if (std::is_same::value) { + state->_type = FunctionContext::TYPE_INT; + } else if (std::is_same::value) { + state->_type = FunctionContext::TYPE_BIGINT; + } else if (std::is_same::value) { + state->_type = FunctionContext::TYPE_LARGEINT; + } else if (std::is_same::value) { + state->_type = FunctionContext::TYPE_DOUBLE; + } else if (std::is_same::value) { + state->_type = FunctionContext::TYPE_FLOAT; + } else { + DCHECK(false); + } + dst->len = state_size; + dst->ptr = (uint8_t*)state; + } + + static void destory(const StringVal& dst) { + delete (MultiDistinctNumericState*)dst.ptr; + } + + void update(T& t) { + _set.insert(t); + } + + // type:one byte value:sizeof(T) + StringVal serialize(FunctionContext* ctx) { + size_t type_size = sizeof(((T*)0)->val); + const size_t serialized_set_length = sizeof(uint8_t) + type_size * _set.size(); + StringVal result(ctx, serialized_set_length); + uint8_t* type_writer = result.ptr; + // type + *type_writer = (uint8_t)_type; + type_writer++; + // value + for (auto& value : _set) { + memcpy(type_writer, &value.val, type_size); + type_writer += type_size; + } + return result; + } + + void unserialize(StringVal& src) { + size_t type_size = sizeof(((T*)0)->val); + const uint8_t* type_reader = src.ptr; + const uint8_t* end = src.ptr + src.len; + // type + _type = (FunctionContext::Type)*type_reader; + type_reader++; + // value + while (type_reader < end) { + T value; + value.is_null = false; + memcpy(&value.val, type_reader, type_size); + _set.insert(value); + type_reader += type_size; + } + } + + // merge set + void 
merge(MultiDistinctNumericState& state) { + _set.insert(state._set.begin(), state._set.end()); + } + + // count + BigIntVal count_finalize() { + return BigIntVal(_set.size()); + } + + // sum for double, decimal + DoubleVal sum_finalize_double() { + double sum = 0; + for (auto& value : _set) { + sum += value.val; + } + return DoubleVal(sum); + } + + // sum for largeint + LargeIntVal sum_finalize_largeint() { + __int128 sum = 0; + for (auto& value : _set) { + sum += value.val; + } + return LargeIntVal(sum); + } + + // sum for tinyint, smallint, int, bigint + BigIntVal sum_finalize_bigint() { + int64_t sum = 0; + for (auto& value : _set) { + sum += value.val; + } + return BigIntVal(sum); + } + + FunctionContext::Type set_type() { + return _type; + } + +private: + + class NumericHashHelper { + public: + size_t operator()(const T& obj) const { + size_t result = AnyValUtil::hash64_murmur(obj, HashUtil::MURMUR_SEED); + return result; + } + }; + + std::unordered_set _set; + // Because Anyval does not provide the hash function, in order + // to adopt the type different from the template, the pointer is used + // HybirdSetBase* _set; + // _type is serialized into buffer by one byte + FunctionContext::Type _type; +}; + +// multi distinct state for string +// // serialize order type:len:value:len:value ... 
+class MultiDistinctStringCountState { +public: + + static void create(StringVal* dst) { + dst->is_null = false; + const int state_size = sizeof(MultiDistinctStringCountState); + MultiDistinctStringCountState* state = new MultiDistinctStringCountState(); + // type length + state->_type = FunctionContext::TYPE_STRING; + dst->len = state_size; + dst->ptr = (uint8_t*)state; + } + + static void destory(const StringVal& dst) { + delete (MultiDistinctStringCountState*)dst.ptr; + } + + inline void update(StringValue* sv) { + _set.insert(sv); + } + + StringVal serialize(FunctionContext* ctx) { + // calculate total serialize buffer length + int total_serialized_set_length = 1; + HybirdSetBase::IteratorBase* iterator = _set.begin(); + while (iterator->has_next()) { + const StringValue* value = + reinterpret_cast(iterator->get_value()); + total_serialized_set_length += STRING_LENGTH_RECORD_LENGTH + value->len; + iterator->next(); + } + StringVal result(ctx, total_serialized_set_length); + uint8_t* writer = result.ptr; + // type + *writer = _type; + writer ++; + iterator = _set.begin(); + while (iterator->has_next()) { + const StringValue* value = reinterpret_cast + (iterator->get_value()); + // length, it is unnecessary to consider little or big endian for + // all running in little-endian. 
+ *(int*)writer = value->len; + writer += STRING_LENGTH_RECORD_LENGTH; + // value + memcpy(writer, value->ptr, value->len); + writer += value->len; + iterator->next(); + } + return result; + } + + void unserialize(StringVal& src) { + uint8_t* reader = src.ptr; + // skip type ,no used now + _type = (FunctionContext::Type)*reader; + DCHECK(_type == FunctionContext::TYPE_STRING); + reader ++; + const uint8_t* end = src.ptr + src.len; + while (reader < end) { + const int length = *(int*)reader; + reader += STRING_LENGTH_RECORD_LENGTH; + StringValue value((char*)reader, length); + _set.insert(&value); + reader += length; + } + DCHECK(reader == end); + } + + // merge set + void merge(MultiDistinctStringCountState& state) { + _set.insert(&(state._set)); + } + + BigIntVal finalize() { + return BigIntVal(_set.size()); + } + + FunctionContext::Type set_type() { + return _type; + } + + static const int STRING_LENGTH_RECORD_LENGTH = 4; +private: + + StringValueSet _set; + // _type is serialized into buffer by one byte + FunctionContext::Type _type; +}; + +// multi distinct state for decimal +// // serialize order type:int_len:frac_len:sign:int_len ... 
+class MultiDistinctDecimalState { +public: + + static void create(StringVal* dst) { + dst->is_null = false; + const int state_size = sizeof(MultiDistinctDecimalState); + MultiDistinctDecimalState* state = new MultiDistinctDecimalState(); + state->_type = FunctionContext::TYPE_DECIMAL; + dst->len = state_size; + dst->ptr = (uint8_t*)state; + } + + static void destory(const StringVal& dst) { + delete (MultiDistinctDecimalState*)dst.ptr; + } + + void update(DecimalVal& t) { + _set.insert(DecimalValue::from_decimal_val(t)); + } + + // type:one byte value:sizeof(T) + StringVal serialize(FunctionContext* ctx) { + const int serialized_set_length = sizeof(uint8_t) + + (DECIMAL_INT_LEN_BYTE_SIZE + + DECIMAL_FRAC_BYTE_SIZE + + DECIMAL_SIGN_BYTE_SIZE + + DECIMAL_BUFFER_BYTE_SIZE) * _set.size(); + StringVal result(ctx, serialized_set_length); + uint8_t* writer = result.ptr; + *writer = (uint8_t)_type; + writer++; + // for int_length and frac_length, uint8_t will not overflow. + for (auto& value : _set) { + *writer = value._int_length; + writer += DECIMAL_INT_LEN_BYTE_SIZE; + *writer = value._frac_length; + writer += DECIMAL_FRAC_BYTE_SIZE; + *writer = value._sign; + writer += DECIMAL_SIGN_BYTE_SIZE; + memcpy(writer, value._buffer, DECIMAL_BUFFER_BYTE_SIZE); + writer += DECIMAL_BUFFER_BYTE_SIZE; + } + return result; + } + + void unserialize(StringVal& src) { + const uint8_t* reader = src.ptr; + // type + _type = (FunctionContext::Type)*reader; + reader++; + const uint8_t* end = src.ptr + src.len; + // value + while (reader < end) { + DecimalValue value; + value._int_length = *reader; + reader += DECIMAL_INT_LEN_BYTE_SIZE; + value._frac_length = *reader; + reader += DECIMAL_FRAC_BYTE_SIZE; + value._sign = *reader; + reader += DECIMAL_SIGN_BYTE_SIZE; + value._buffer_length = DECIMAL_BUFF_LENGTH; + memcpy(value._buffer, reader, DECIMAL_BUFFER_BYTE_SIZE); + reader += DECIMAL_BUFFER_BYTE_SIZE; + _set.insert(value); + } + } + + FunctionContext::Type set_type() { + return _type; + } 
+ + // merge set + void merge(MultiDistinctDecimalState& state) { + _set.insert(state._set.begin(), state._set.end()); + } + + // count + BigIntVal count_finalize() { + return BigIntVal(_set.size()); + } + + DecimalVal sum_finalize() { + DecimalValue sum; + for (auto& value : _set) { + sum += value; + } + DecimalVal result; + sum.to_decimal_val(&result); + return result; + } + +private: + + const int DECIMAL_INT_LEN_BYTE_SIZE = 1; + const int DECIMAL_FRAC_BYTE_SIZE = 1; + const int DECIMAL_SIGN_BYTE_SIZE = 1; + const int DECIMAL_BUFFER_BYTE_SIZE = 36; + + std::unordered_set _set; + FunctionContext::Type _type; +}; + +// multi distinct state for date +// // serialize order type:packed_time:type:packed_time:type ... +class MultiDistinctCountDateState { +public: + + static void create(StringVal* dst) { + dst->is_null = false; + const int state_size = sizeof(MultiDistinctCountDateState); + MultiDistinctCountDateState* state = new MultiDistinctCountDateState(); + state->_type = FunctionContext::TYPE_DATETIME; + dst->len = state_size; + dst->ptr = (uint8_t*)state; + } + + static void destory(const StringVal& dst) { + delete (MultiDistinctCountDateState*)dst.ptr; + } + + void update(DateTimeVal& t) { + _set.insert(t); + } + + // type:one byte value:sizeof(T) + StringVal serialize(FunctionContext* ctx) { + const int serialized_set_length = sizeof(uint8_t) + + (DATETIME_PACKED_TIME_BYTE_SIZE + DATETIME_TYPE_BYTE_SIZE) * _set.size(); + StringVal result(ctx, serialized_set_length); + uint8_t* writer = result.ptr; + // type + *writer = (uint8_t)_type; + writer++; + // value + for (auto& value : _set) { + int64_t* packed_time_writer = (int64_t*)writer; + *packed_time_writer = value.packed_time; + writer += DATETIME_PACKED_TIME_BYTE_SIZE; + int* type_writer = (int*)writer; + *type_writer = value.type; + writer += DATETIME_TYPE_BYTE_SIZE; + } + return result; + } + + void unserialize(StringVal& src) { + const uint8_t* reader = src.ptr; + // type + _type = 
(FunctionContext::Type)*reader; + reader++; + const uint8_t* end = src.ptr + src.len; + // value + while (reader < end) { + DateTimeVal value; + value.is_null = false; + int64_t* packed_time_reader = (int64_t*)reader; + value.packed_time = *packed_time_reader; + reader += DATETIME_PACKED_TIME_BYTE_SIZE; + int* type_reader = (int*)reader; + value.type = *type_reader; + reader += DATETIME_TYPE_BYTE_SIZE; + _set.insert(value); + } + } + + // merge set + void merge(MultiDistinctCountDateState& state) { + _set.insert(state._set.begin(), state._set.end()); + } + + // count + BigIntVal count_finalize() { + return BigIntVal(_set.size()); + } + + FunctionContext::Type set_type() { + return _type; + } + +private: + + class DateTimeHashHelper { + public: + size_t operator()(const DateTimeVal& obj) const { + size_t result = AnyValUtil::hash64_murmur(obj, HashUtil::MURMUR_SEED); + return result; + } + }; + + const int DATETIME_PACKED_TIME_BYTE_SIZE = 8; + const int DATETIME_TYPE_BYTE_SIZE = 4; + + std::unordered_set _set; + FunctionContext::Type _type; +}; + +template +void AggregateFunctions::count_or_sum_distinct_numeric_init(FunctionContext* ctx, StringVal* dst) { + MultiDistinctNumericState::create(dst); +} + +void AggregateFunctions::count_distinct_string_init(FunctionContext* ctx, StringVal* dst) { + MultiDistinctStringCountState::create(dst); +} + +void AggregateFunctions::count_or_sum_distinct_decimal_init(FunctionContext* ctx, StringVal* dst) { + MultiDistinctDecimalState::create(dst); +} + +void AggregateFunctions::count_distinct_date_init(FunctionContext* ctx, StringVal* dst) { + MultiDistinctCountDateState::create(dst); +} + +template +void AggregateFunctions::count_or_sum_distinct_numeric_update(FunctionContext* ctx, T& src, + StringVal* dst) { + DCHECK(!dst->is_null); + if (src.is_null) return; + MultiDistinctNumericState* state = reinterpret_cast*>(dst->ptr); + state->update(src); +} + +void AggregateFunctions::count_distinct_string_update(FunctionContext* ctx, 
StringVal& src, + StringVal* dst) { + DCHECK(!dst->is_null); + if (src.is_null) return; + MultiDistinctStringCountState* state = reinterpret_cast(dst->ptr); + StringValue sv = StringValue::from_string_val(src); + state->update(&sv); +} + +void AggregateFunctions::count_or_sum_distinct_decimal_update(FunctionContext* ctx, DecimalVal& src, + StringVal* dst) { + DCHECK(!dst->is_null); + if (src.is_null) return; + MultiDistinctDecimalState* state = reinterpret_cast(dst->ptr); + state->update(src); +} + +void AggregateFunctions::count_distinct_date_update(FunctionContext* ctx, DateTimeVal& src, + StringVal* dst) { + DCHECK(!dst->is_null); + if (src.is_null) return; + MultiDistinctCountDateState* state = reinterpret_cast(dst->ptr); + state->update(src); +} + +template +void AggregateFunctions::count_or_sum_distinct_numeric_merge(FunctionContext* ctx, StringVal& src, + StringVal* dst) { + DCHECK(!dst->is_null); + DCHECK(!src.is_null); + MultiDistinctNumericState* dst_state = reinterpret_cast*>(dst->ptr); + // unserialize src + StringVal src_state_val; + MultiDistinctNumericState::create(&src_state_val); + MultiDistinctNumericState* src_state = reinterpret_cast*>(src_state_val.ptr); + src_state->unserialize(src); + DCHECK(dst_state->set_type() == src_state->set_type()); + dst_state->merge(*src_state); + MultiDistinctNumericState::destory(src_state_val); +} + +void AggregateFunctions::count_distinct_string_merge(FunctionContext* ctx, StringVal& src, + StringVal* dst) { + DCHECK(!dst->is_null); + DCHECK(!src.is_null); + MultiDistinctStringCountState* dst_state = reinterpret_cast(dst->ptr); + // unserialize src + StringVal src_state_val; + MultiDistinctStringCountState::create(&src_state_val); + MultiDistinctStringCountState* src_state = reinterpret_cast(src_state_val.ptr); + src_state->unserialize(src); + DCHECK(dst_state->set_type() == src_state->set_type()); + dst_state->merge(*src_state); + MultiDistinctStringCountState::destory(src_state_val); +} + + +void 
AggregateFunctions::count_or_sum_distinct_decimal_merge(FunctionContext* ctx, StringVal& src, + StringVal* dst) { + DCHECK(!dst->is_null); + DCHECK(!src.is_null); + MultiDistinctDecimalState* dst_state = reinterpret_cast(dst->ptr); + // unserialize src + StringVal src_state_val; + MultiDistinctDecimalState::create(&src_state_val); + MultiDistinctDecimalState* src_state = reinterpret_cast(src_state_val.ptr); + src_state->unserialize(src); + DCHECK(dst_state->set_type() == src_state->set_type()); + dst_state->merge(*src_state); + MultiDistinctDecimalState::destory(src_state_val); +} + +void AggregateFunctions::count_distinct_date_merge(FunctionContext* ctx, StringVal& src, + StringVal* dst) { + DCHECK(!dst->is_null); + DCHECK(!src.is_null); + MultiDistinctCountDateState* dst_state = reinterpret_cast(dst->ptr); + // unserialize src + StringVal src_state_val; + MultiDistinctCountDateState::create(&src_state_val); + MultiDistinctCountDateState* src_state = reinterpret_cast(src_state_val.ptr); + src_state->unserialize(src); + DCHECK(dst_state->set_type() == src_state->set_type()); + dst_state->merge(*src_state); + MultiDistinctCountDateState::destory(src_state_val); +} + +template +StringVal AggregateFunctions::count_or_sum_distinct_numeric_serialize(FunctionContext* ctx, const StringVal& state_sv) { + DCHECK(!state_sv.is_null); + MultiDistinctNumericState* state = reinterpret_cast*>(state_sv.ptr); + StringVal result = state->serialize(ctx); + // release original object + MultiDistinctNumericState::destory(state_sv); + return result; +} + +StringVal AggregateFunctions::count_distinct_string_serialize(FunctionContext* ctx, const StringVal& state_sv) { + DCHECK(!state_sv.is_null); + MultiDistinctStringCountState* state = reinterpret_cast(state_sv.ptr); + StringVal result = state->serialize(ctx); + // release original object + MultiDistinctStringCountState::destory(state_sv); + return result; +} + +StringVal 
AggregateFunctions::count_or_sum_distinct_decimal_serialize(FunctionContext* ctx, const StringVal& state_sv) { + DCHECK(!state_sv.is_null); + MultiDistinctDecimalState* state = reinterpret_cast(state_sv.ptr); + StringVal result = state->serialize(ctx); + // release original object + MultiDistinctDecimalState::destory(state_sv); + return result; +} + +StringVal AggregateFunctions::count_distinct_date_serialize(FunctionContext* ctx, const StringVal& state_sv) { + DCHECK(!state_sv.is_null); + MultiDistinctCountDateState* state = reinterpret_cast(state_sv.ptr); + StringVal result = state->serialize(ctx); + // release original object + MultiDistinctCountDateState::destory(state_sv); + return result; +} + +template +BigIntVal AggregateFunctions::count_or_sum_distinct_numeric_finalize(FunctionContext* ctx, const StringVal& state_sv) { + DCHECK(!state_sv.is_null); + MultiDistinctNumericState* state = reinterpret_cast*>(state_sv.ptr); + BigIntVal result = state->count_finalize(); + MultiDistinctNumericState::destory(state_sv); + return result; +} + +BigIntVal AggregateFunctions::count_distinct_string_finalize(FunctionContext* ctx, const StringVal& state_sv) { + DCHECK(!state_sv.is_null); + MultiDistinctStringCountState* state = reinterpret_cast(state_sv.ptr); + BigIntVal result = state->finalize(); + MultiDistinctStringCountState::destory(state_sv); + return result; +} + +template +DoubleVal AggregateFunctions::sum_distinct_double_finalize(FunctionContext* ctx, const StringVal& state_sv) { + DCHECK(!state_sv.is_null); + MultiDistinctNumericState* state = reinterpret_cast*>(state_sv.ptr); + DoubleVal result = state->sum_finalize_double(); + MultiDistinctNumericState::destory(state_sv); + return result; +} + +template +LargeIntVal AggregateFunctions::sum_distinct_largeint_finalize(FunctionContext* ctx, const StringVal& state_sv) { + DCHECK(!state_sv.is_null); + MultiDistinctNumericState* state = reinterpret_cast*>(state_sv.ptr); + LargeIntVal result = 
state->sum_finalize_largeint(); + MultiDistinctNumericState::destory(state_sv); + return result; +} + +template +BigIntVal AggregateFunctions::sum_distinct_bigint_finalize(FunctionContext* ctx, const StringVal& state_sv) { + DCHECK(!state_sv.is_null); + MultiDistinctNumericState* state = reinterpret_cast*>(state_sv.ptr); + BigIntVal result = state->sum_finalize_bigint(); + MultiDistinctNumericState::destory(state_sv); + return result; +} + +BigIntVal AggregateFunctions::count_distinct_decimal_finalize(FunctionContext* ctx, const StringVal& state_sv) { + DCHECK(!state_sv.is_null); + MultiDistinctDecimalState* state = reinterpret_cast(state_sv.ptr); + BigIntVal result = state->count_finalize(); + MultiDistinctDecimalState::destory(state_sv); + return result; +} + +DecimalVal AggregateFunctions::sum_distinct_decimal_finalize(FunctionContext* ctx, const StringVal& state_sv) { + DCHECK(!state_sv.is_null); + MultiDistinctDecimalState* state = reinterpret_cast(state_sv.ptr); + DecimalVal result = state->sum_finalize(); + MultiDistinctDecimalState::destory(state_sv); + return result; +} + +BigIntVal AggregateFunctions::count_distinct_date_finalize(FunctionContext* ctx, const StringVal& state_sv) { + DCHECK(!state_sv.is_null); + MultiDistinctCountDateState* state = reinterpret_cast(state_sv.ptr); + BigIntVal result = state->count_finalize(); + MultiDistinctCountDateState::destory(state_sv); + return result; +} + // An implementation of a simple single pass variance algorithm. A standard UDA must // be single pass (i.e. does not scan the table more than once), so the most canonical // two pass approach is not practical. 
@@ -1139,6 +1782,7 @@ DoubleVal AggregateFunctions::knuth_var_finalize(FunctionContext* ctx, const Str KnuthVarianceState* state = reinterpret_cast(state_sv.ptr); if (state->count == 0 || state->count == 1) return DoubleVal::null(); double variance = compute_knuth_variance(*state, false); + ctx->free(state_sv.ptr); return DoubleVal(variance); } @@ -1148,7 +1792,9 @@ DoubleVal AggregateFunctions::knuth_var_pop_finalize(FunctionContext* ctx, DCHECK_EQ(state_sv.len, sizeof(KnuthVarianceState)); KnuthVarianceState* state = reinterpret_cast(state_sv.ptr); if (state->count == 0) return DoubleVal::null(); - return compute_knuth_variance(*state, true); + double variance = compute_knuth_variance(*state, true); + ctx->free(state_sv.ptr); + return DoubleVal(variance); } DoubleVal AggregateFunctions::knuth_stddev_finalize(FunctionContext* ctx, @@ -1157,7 +1803,9 @@ DoubleVal AggregateFunctions::knuth_stddev_finalize(FunctionContext* ctx, DCHECK_EQ(state_sv.len, sizeof(KnuthVarianceState)); KnuthVarianceState* state = reinterpret_cast(state_sv.ptr); if (state->count == 0 || state->count == 1) return DoubleVal::null(); - return sqrt(compute_knuth_variance(*state, false)); + double variance = sqrt(compute_knuth_variance(*state, false)); + ctx->free(state_sv.ptr); + return DoubleVal(variance); } DoubleVal AggregateFunctions::knuth_stddev_pop_finalize(FunctionContext* ctx, @@ -1166,7 +1814,9 @@ DoubleVal AggregateFunctions::knuth_stddev_pop_finalize(FunctionContext* ctx, DCHECK_EQ(state_sv.len, sizeof(KnuthVarianceState)); KnuthVarianceState* state = reinterpret_cast(state_sv.ptr); if (state->count == 0) return DoubleVal::null(); - return sqrt(compute_knuth_variance(*state, true)); + double variance = sqrt(compute_knuth_variance(*state, true)); + ctx->free(state_sv.ptr); + return DoubleVal(variance); } struct RankState { @@ -1520,6 +2170,97 @@ template void AggregateFunctions::hll_update( template void AggregateFunctions::hll_update( FunctionContext*, const DecimalVal&, 
StringVal*); +template void AggregateFunctions::count_or_sum_distinct_numeric_init( + FunctionContext* ctx, StringVal* dst); +template void AggregateFunctions::count_or_sum_distinct_numeric_init( + FunctionContext* ctx, StringVal* dst); +template void AggregateFunctions::count_or_sum_distinct_numeric_init( + FunctionContext* ctx, StringVal* dst); +template void AggregateFunctions::count_or_sum_distinct_numeric_init( + FunctionContext* ctx, StringVal* dst); +template void AggregateFunctions::count_or_sum_distinct_numeric_init( + FunctionContext* ctx, StringVal* dst); +template void AggregateFunctions::count_or_sum_distinct_numeric_init( + FunctionContext* ctx, StringVal* dst); +template void AggregateFunctions::count_or_sum_distinct_numeric_init( + FunctionContext* ctx, StringVal* dst); + + +template void AggregateFunctions::count_or_sum_distinct_numeric_update( + FunctionContext* ctx, TinyIntVal& src, StringVal* dst); +template void AggregateFunctions::count_or_sum_distinct_numeric_update( + FunctionContext* ctx, SmallIntVal& src, StringVal* dst); +template void AggregateFunctions::count_or_sum_distinct_numeric_update( + FunctionContext* ctx, IntVal& src, StringVal* dst); +template void AggregateFunctions::count_or_sum_distinct_numeric_update( + FunctionContext* ctx, BigIntVal& src, StringVal* dst); +template void AggregateFunctions::count_or_sum_distinct_numeric_update( + FunctionContext* ctx, FloatVal& src, StringVal* dst); +template void AggregateFunctions::count_or_sum_distinct_numeric_update( + FunctionContext* ctx, DoubleVal& src, StringVal* dst); +template void AggregateFunctions::count_or_sum_distinct_numeric_update( + FunctionContext* ctx, LargeIntVal& src, StringVal* dst); + +template void AggregateFunctions::count_or_sum_distinct_numeric_merge( + FunctionContext* ctx, StringVal& src, StringVal* dst); +template void AggregateFunctions::count_or_sum_distinct_numeric_merge( + FunctionContext* ctx, StringVal& src, StringVal* dst); +template void 
AggregateFunctions::count_or_sum_distinct_numeric_merge( + FunctionContext* ctx, StringVal& src, StringVal* dst); +template void AggregateFunctions::count_or_sum_distinct_numeric_merge( + FunctionContext* ctx, StringVal& src, StringVal* dst); +template void AggregateFunctions::count_or_sum_distinct_numeric_merge( + FunctionContext* ctx, StringVal& src, StringVal* dst); +template void AggregateFunctions::count_or_sum_distinct_numeric_merge( + FunctionContext* ctx, StringVal& src, StringVal* dst); +template void AggregateFunctions::count_or_sum_distinct_numeric_merge( + FunctionContext* ctx, StringVal& src, StringVal* dst); + +template StringVal AggregateFunctions::count_or_sum_distinct_numeric_serialize( + FunctionContext* ctx, const StringVal& state_sv); +template StringVal AggregateFunctions::count_or_sum_distinct_numeric_serialize( + FunctionContext* ctx, const StringVal& state_sv); +template StringVal AggregateFunctions::count_or_sum_distinct_numeric_serialize( + FunctionContext* ctx, const StringVal& state_sv); +template StringVal AggregateFunctions::count_or_sum_distinct_numeric_serialize( + FunctionContext* ctx, const StringVal& state_sv); +template StringVal AggregateFunctions::count_or_sum_distinct_numeric_serialize( + FunctionContext* ctx, const StringVal& state_sv); +template StringVal AggregateFunctions::count_or_sum_distinct_numeric_serialize( + FunctionContext* ctx, const StringVal& state_sv); +template StringVal AggregateFunctions::count_or_sum_distinct_numeric_serialize( + FunctionContext* ctx, const StringVal& state_sv); + +template BigIntVal AggregateFunctions::count_or_sum_distinct_numeric_finalize( + FunctionContext* ctx, const StringVal& state_sv); +template BigIntVal AggregateFunctions::count_or_sum_distinct_numeric_finalize( + FunctionContext* ctx, const StringVal& state_sv); +template BigIntVal AggregateFunctions::count_or_sum_distinct_numeric_finalize( + FunctionContext* ctx, const StringVal& state_sv); +template BigIntVal 
AggregateFunctions::count_or_sum_distinct_numeric_finalize( + FunctionContext* ctx, const StringVal& state_sv); +template BigIntVal AggregateFunctions::count_or_sum_distinct_numeric_finalize( + FunctionContext* ctx, const StringVal& state_sv); +template BigIntVal AggregateFunctions::count_or_sum_distinct_numeric_finalize( + FunctionContext* ctx, const StringVal& state_sv); +template BigIntVal AggregateFunctions::count_or_sum_distinct_numeric_finalize( + FunctionContext* ctx, const StringVal& state_sv); + +template BigIntVal AggregateFunctions::sum_distinct_bigint_finalize( + FunctionContext* ctx, const StringVal& state_sv); +template BigIntVal AggregateFunctions::sum_distinct_bigint_finalize( + FunctionContext* ctx, const StringVal& state_sv); +template BigIntVal AggregateFunctions::sum_distinct_bigint_finalize( + FunctionContext* ctx, const StringVal& state_sv); +template BigIntVal AggregateFunctions::sum_distinct_bigint_finalize( + FunctionContext* ctx, const StringVal& state_sv); + +template DoubleVal AggregateFunctions::sum_distinct_double_finalize( + FunctionContext* ctx, const StringVal& state_sv); + +template LargeIntVal AggregateFunctions::sum_distinct_largeint_finalize( + FunctionContext* ctx, const StringVal& state_sv); + template void AggregateFunctions::knuth_var_update( FunctionContext*, const TinyIntVal&, StringVal*); template void AggregateFunctions::knuth_var_update( diff --git a/be/src/exprs/aggregate_functions.h b/be/src/exprs/aggregate_functions.h index 0a46aa3fd3..b743688ac2 100644 --- a/be/src/exprs/aggregate_functions.h +++ b/be/src/exprs/aggregate_functions.h @@ -24,11 +24,12 @@ //#include "exprs/opcode_registry.h" #include "udf/udf.h" #include "udf/udf_internal.h" -#include "olap/field.h" +#include "olap/hll.h" namespace palo { class HllSetResolver; +class HybirdSetBase; // Collection of builtin aggregate functions. 
Aggregate functions implement // the various phases of the aggregation: Init(), Update(), Serialize(), Merge(), @@ -187,6 +188,47 @@ dst); palo_udf::FunctionContext*, const palo_udf::StringVal& src); + // count and sum distinct algorithm in multi distinct + template + static void count_or_sum_distinct_numeric_init(palo_udf::FunctionContext* ctx, palo_udf::StringVal* dst); + template + static void count_or_sum_distinct_numeric_update(FunctionContext* ctx, T& src, StringVal* dst); + template + static void count_or_sum_distinct_numeric_merge(FunctionContext* ctx, StringVal& src, StringVal* dst); + template + static StringVal count_or_sum_distinct_numeric_serialize(FunctionContext* ctx, const StringVal& state_sv); + template + static BigIntVal count_or_sum_distinct_numeric_finalize(FunctionContext* ctx, const StringVal& state_sv); + + // count distinct in multi distinct for string + static void count_distinct_string_init(palo_udf::FunctionContext* ctx, palo_udf::StringVal* dst); + static void count_distinct_string_update(FunctionContext* ctx, StringVal& src, StringVal* dst); + static void count_distinct_string_merge(FunctionContext* ctx, StringVal& src, StringVal* dst); + static StringVal count_distinct_string_serialize(FunctionContext* ctx, const StringVal& state_sv); + static BigIntVal count_distinct_string_finalize(FunctionContext* ctx, const StringVal& state_sv); + + // count distinct in multi distinct for decimal + static void count_or_sum_distinct_decimal_init(palo_udf::FunctionContext* ctx, palo_udf::StringVal* dst); + static void count_or_sum_distinct_decimal_update(FunctionContext* ctx, DecimalVal& src, StringVal* dst); + static void count_or_sum_distinct_decimal_merge(FunctionContext* ctx, StringVal& src, StringVal* dst); + static StringVal count_or_sum_distinct_decimal_serialize(FunctionContext* ctx, const StringVal& state_sv); + static BigIntVal count_distinct_decimal_finalize(FunctionContext* ctx, const StringVal& state_sv); + static DecimalVal 
sum_distinct_decimal_finalize(FunctionContext* ctx, const StringVal& state_sv); + + // count distinct in multi disticnt for Date + static void count_distinct_date_init(palo_udf::FunctionContext* ctx, palo_udf::StringVal* dst); + static void count_distinct_date_update(FunctionContext* ctx, DateTimeVal& src, StringVal* dst); + static void count_distinct_date_merge(FunctionContext* ctx, StringVal& src, StringVal* dst); + static StringVal count_distinct_date_serialize(FunctionContext* ctx, const StringVal& state_sv); + static BigIntVal count_distinct_date_finalize(FunctionContext* ctx, const StringVal& state_sv); + + template + static BigIntVal sum_distinct_bigint_finalize(FunctionContext* ctx, const StringVal& state_sv); + template + static LargeIntVal sum_distinct_largeint_finalize(FunctionContext* ctx, const StringVal& state_sv); + template + static DoubleVal sum_distinct_double_finalize(FunctionContext* ctx, const StringVal& state_sv); + /// Knuth's variance algorithm, more numerically stable than canonical stddev /// algorithms; reference implementation: /// http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm diff --git a/be/src/exprs/anyval_util.cpp b/be/src/exprs/anyval_util.cpp index c704ca69db..6efd6ac846 100755 --- a/be/src/exprs/anyval_util.cpp +++ b/be/src/exprs/anyval_util.cpp @@ -20,6 +20,10 @@ #include "exprs/anyval_util.h" +#include "exprs/anyval_util.h" +#include "runtime/mem_pool.h" +#include "runtime/mem_tracker.h" + namespace palo { using palo_udf::BooleanVal; using palo_udf::TinyIntVal; @@ -34,6 +38,20 @@ using palo_udf::DateTimeVal; using palo_udf::StringVal; using palo_udf::AnyVal; +Status allocate_any_val(RuntimeState* state, MemPool* pool, const TypeDescriptor& type, + const std::string& mem_limit_exceeded_msg, AnyVal** result) { + const int anyval_size = AnyValUtil::any_val_size(type); + const int anyval_alignment = AnyValUtil::any_val_alignment(type); + *result = + 
reinterpret_cast(pool->try_allocate_aligned(anyval_size, anyval_alignment)); + if (*result == NULL) { + return pool->mem_tracker()->MemLimitExceeded( + state, mem_limit_exceeded_msg, anyval_size); + } + memset(*result, 0, anyval_size); + return Status::OK; +} + AnyVal* create_any_val(ObjectPool* pool, const TypeDescriptor& type) { switch (type.type) { case TYPE_NULL: diff --git a/be/src/exprs/anyval_util.h b/be/src/exprs/anyval_util.h index 0378fcf64e..e83f99273c 100755 --- a/be/src/exprs/anyval_util.h +++ b/be/src/exprs/anyval_util.h @@ -25,9 +25,12 @@ #include "runtime/primitive_type.h" #include "udf/udf.h" #include "util/hash_util.hpp" +#include "common/status.h" namespace palo { +class MemPool; + // Utilities for AnyVals class AnyValUtil { public: @@ -195,6 +198,7 @@ public: return palo_udf::FunctionContext::TYPE_DATE; case TYPE_DATETIME: return palo_udf::FunctionContext::TYPE_DATETIME; + case TYPE_HLL: case TYPE_CHAR: case TYPE_VARCHAR: return palo_udf::FunctionContext::TYPE_STRING; @@ -233,6 +237,8 @@ public: case TYPE_DOUBLE: return sizeof(palo_udf::DoubleVal); + case TYPE_HLL: + case TYPE_CHAR: case TYPE_VARCHAR: return sizeof(palo_udf::StringVal); @@ -249,6 +255,31 @@ public: } } + /// Returns the byte alignment of *Val for type t. 
+ static int any_val_alignment(const TypeDescriptor& t) { + switch (t.type) { + case TYPE_BOOLEAN: return alignof(BooleanVal); + case TYPE_TINYINT: return alignof(TinyIntVal); + case TYPE_SMALLINT: return alignof(SmallIntVal); + case TYPE_INT: return alignof(IntVal); + case TYPE_BIGINT: return alignof(BigIntVal); + case TYPE_LARGEINT: return alignof(LargeIntVal); + case TYPE_FLOAT: return alignof(FloatVal); + case TYPE_DOUBLE: return alignof(DoubleVal); + case TYPE_HLL: + case TYPE_VARCHAR: + case TYPE_CHAR: + return alignof(StringVal); + case TYPE_DATETIME: + case TYPE_DATE: + return alignof(DateTimeVal); + case TYPE_DECIMAL: return alignof(DecimalVal); + default: + DCHECK(false) << t; + return 0; + } + } + static std::string to_string(const StringVal& v) { return std::string(reinterpret_cast(v.ptr), v.len); } @@ -308,8 +339,7 @@ public: *reinterpret_cast(slot); return; case TYPE_LARGEINT: - reinterpret_cast(dst)->val = - *reinterpret_cast(slot); + memcpy(&reinterpret_cast(dst)->val, slot, sizeof(__int128)); return; case TYPE_FLOAT: reinterpret_cast(dst)->val = @@ -405,6 +435,12 @@ inline bool AnyValUtil::equals_intenal(const DecimalVal& x, const DecimalVal& y) // Creates the corresponding AnyVal subclass for type. The object is added to the pool. palo_udf::AnyVal* create_any_val(ObjectPool* pool, const TypeDescriptor& type); -} +/// Allocates an AnyVal subclass of 'type' from 'pool'. The AnyVal's memory is +/// initialized to all 0's. Returns a MemLimitExceeded() error with message +/// 'mem_limit_exceeded_msg' if the allocation cannot be made because of a memory +/// limit. 
+Status allocate_any_val(RuntimeState* state, MemPool* pool, const TypeDescriptor& type, + const std::string& mem_limit_exceeded_msg, AnyVal** result); +} #endif diff --git a/be/src/exprs/expr.cpp b/be/src/exprs/expr.cpp index 2edc97ad45..092c8f0cb3 100644 --- a/be/src/exprs/expr.cpp +++ b/be/src/exprs/expr.cpp @@ -21,6 +21,7 @@ #include "exprs/expr.h" #include +#include #include #include @@ -60,6 +61,7 @@ using llvm::Instruction; using llvm::CallInst; using llvm::ConstantInt; using llvm::Value; +using std::vector; namespace palo { const char* Expr::_s_llvm_class_name = "class.palo::Expr"; @@ -919,8 +921,8 @@ int Expr::inline_constants(LlvmCodeGen* codegen, Function* fn) { Function* called_fn = call_instr->getCalledFunction(); // Look for call to Expr::GetConstant() - if (called_fn == NULL - || called_fn->getName().find(_s_get_constant_symbol_prefix) == string::npos) { + if (called_fn == NULL || + called_fn->getName().find(_s_get_constant_symbol_prefix) == std::string::npos) { continue; } @@ -975,4 +977,118 @@ Expr* Expr::copy(ObjectPool* pool, Expr* old_expr) { return new_expr; } +void Expr::assign_fn_ctx_idx(int* next_fn_ctx_idx) { + _fn_ctx_idx_start = *next_fn_ctx_idx; + if (has_fn_ctx()) { + _fn_ctx_idx = *next_fn_ctx_idx; + ++(*next_fn_ctx_idx); + } + for (Expr* child : children()) child->assign_fn_ctx_idx(next_fn_ctx_idx); + _fn_ctx_idx_end = *next_fn_ctx_idx; +} + + +Status Expr::create(const TExpr& texpr, const RowDescriptor& row_desc, + RuntimeState* state, ObjectPool* pool, Expr** scalar_expr, + MemTracker* tracker) { + *scalar_expr = nullptr; + Expr* root; + RETURN_IF_ERROR(create_expr(pool, texpr.nodes[0], &root)); + RETURN_IF_ERROR(create_tree(texpr, pool, root)); + // TODO pengyubing replace by Init() + ExprContext* ctx = pool->add(new ExprContext(root)); + // TODO chenhao check node type in ScalarExpr Init() + Status status = Status::OK; + if (texpr.nodes[0].node_type != TExprNodeType::CASE_EXPR) { + status = root->prepare(state, row_desc, ctx); + } 
+ if (UNLIKELY(!status.ok())) { + root->close(); + return status; + } + int fn_ctx_idx = 0; + root->assign_fn_ctx_idx(&fn_ctx_idx); + *scalar_expr = root; + return Status::OK; +} + +Status Expr::create(const vector& texprs, const RowDescriptor& row_desc, + RuntimeState* state, ObjectPool* pool, vector* exprs, MemTracker* tracker) { + exprs->clear(); + for (const TExpr& texpr: texprs) { + Expr* expr; + RETURN_IF_ERROR(create(texpr, row_desc, state, pool, &expr, tracker)); + DCHECK(expr != nullptr); + exprs->push_back(expr); + } + return Status::OK; +} + +Status Expr::create(const TExpr& texpr, const RowDescriptor& row_desc, + RuntimeState* state, Expr** scalar_expr, MemTracker* tracker) { + return Expr::create(texpr, row_desc, state, state->obj_pool(), scalar_expr, tracker); +} + +Status Expr::create(const vector& texprs, const RowDescriptor& row_desc, + RuntimeState* state, vector* exprs, MemTracker* tracker) { + return Expr::create(texprs, row_desc, state, state->obj_pool(), exprs, tracker); +} + +Status Expr::create_tree(const TExpr& texpr, ObjectPool* pool, Expr* root) { + DCHECK(!texpr.nodes.empty()); + DCHECK(root != nullptr); + // The root of the tree at nodes[0] is already created and stored in 'root'. + int child_node_idx = 0; + int num_children = texpr.nodes[0].num_children; + for (int i = 0; i < num_children; ++i) { + ++child_node_idx; + Status status = create_tree_internal(texpr.nodes, pool, root, &child_node_idx); + if (UNLIKELY(!status.ok())) { + LOG(ERROR) << "Could not construct expr tree.\n" << status.get_error_msg() << "\n" + << apache::thrift::ThriftDebugString(texpr); + return status; + } + } + if (UNLIKELY(child_node_idx + 1 != texpr.nodes.size())) { + return Status("Expression tree only partially reconstructed. 
Not all thrift " \ + "nodes were used."); + } + return Status::OK; +} + +Status Expr::create_tree_internal(const vector& nodes, ObjectPool* pool, + Expr* root, int* child_node_idx) { + // propagate error case + if (*child_node_idx >= nodes.size()) { + return Status("Failed to reconstruct expression tree from thrift."); + } + + const TExprNode& texpr_node = nodes[*child_node_idx]; + DCHECK_NE(texpr_node.node_type, TExprNodeType::AGG_EXPR); + Expr* child_expr; + RETURN_IF_ERROR(create_expr(pool, texpr_node, &child_expr)); + root->_children.push_back(child_expr); + + int num_children = nodes[*child_node_idx].num_children; + for (int i = 0; i < num_children; ++i) { + *child_node_idx += 1; + RETURN_IF_ERROR(create_tree_internal(nodes, pool, child_expr, child_node_idx)); + DCHECK(child_expr->get_child(i) != nullptr); + } + return Status::OK; +} + +// TODO chenhao +void Expr::close() { + for (Expr* child : _children) child->close(); + /*if (_cache_entry != nullptr) { + LibCache::instance()->decrement_use_count(_cache_entry); + _cache_entry = nullptr; + }*/ +} + +void Expr::close(const vector& exprs) { + for (Expr* expr : exprs) expr->close(); +} + } diff --git a/be/src/exprs/expr.h b/be/src/exprs/expr.h index b0420b218b..3c77b6ac90 100644 --- a/be/src/exprs/expr.h +++ b/be/src/exprs/expr.h @@ -67,6 +67,7 @@ class SetVar; class TupleIsNullPredicate; class VectorizedRowBatch; class Literal; +class MemTracker; // This is the superclass of all expr evaluation nodes. class Expr { @@ -165,6 +166,12 @@ public: return _is_slotref; } + /// Returns true if this expr uses a FunctionContext to track its runtime state. + /// Overridden by exprs which use FunctionContext. + virtual bool has_fn_ctx() const { + return false; + } + /// Returns an error status if the function context associated with the /// expr has an error set. 
Status get_fn_context_error(ExprContext* ctx); @@ -198,6 +205,27 @@ public: static Status create_expr_trees(ObjectPool* pool, const std::vector& texprs, std::vector* ctxs); + /// Create a new ScalarExpr based on thrift Expr 'texpr'. The newly created ScalarExpr + /// is stored in ObjectPool 'pool' and returned in 'expr' on success. 'row_desc' is the + /// tuple row descriptor of the input tuple row. On failure, 'expr' is set to NULL and + /// the expr tree (if created) will be closed. Error status will be returned too. + static Status create(const TExpr& texpr, const RowDescriptor& row_desc, + RuntimeState* state, ObjectPool* pool, Expr** expr, MemTracker* tracker); + + /// Create a new ScalarExpr based on thrift Expr 'texpr'. The newly created ScalarExpr + /// is stored in ObjectPool 'state->obj_pool()' and returned in 'expr'. 'row_desc' is + /// the tuple row descriptor of the input tuple row. Returns error status on failure. + static Status create(const TExpr& texpr, const RowDescriptor& row_desc, + RuntimeState* state, Expr** expr, MemTracker* tracker); + + /// Convenience functions creating multiple ScalarExpr. + static Status create(const std::vector& texprs, const RowDescriptor& row_desc, + RuntimeState* state, ObjectPool* pool, std::vector* exprs, MemTracker* tracker); + + /// Convenience functions creating multiple ScalarExpr. + static Status create(const std::vector& texprs, const RowDescriptor& row_desc, + RuntimeState* state, std::vector* exprs, MemTracker* tracker); + /// Convenience function for preparing multiple expr trees. /// Allocations from 'ctxs' will be counted against 'tracker'. static Status prepare(const std::vector& ctxs, RuntimeState* state, @@ -218,6 +246,9 @@ public: /// Convenience function for closing multiple expr trees. static void close(const std::vector& ctxs, RuntimeState* state); + /// Convenience functions for closing a list of ScalarExpr. 
+ static void close(const std::vector& exprs); + // Computes a memory efficient layout for storing the results of evaluating 'exprs' // Returns the number of bytes necessary to store all the results and offsets // where the result for each expr should be stored. @@ -247,6 +278,12 @@ public: /// GetCodegendComputeFn(). int inline_constants(LlvmCodeGen* codegen, llvm::Function* fn); + /// Assigns indices into the FunctionContext vector 'fn_ctxs_' in an evaluator to + /// nodes which need FunctionContext in the tree. 'next_fn_ctx_idx' is the index + /// of the next available entry in the vector. It's updated as this function is + /// called recursively down the tree. + void assign_fn_ctx_idx(int* next_fn_ctx_idx); + virtual std::string debug_string() const; static std::string debug_string(const std::vector& exprs); static std::string debug_string(const std::vector& ctxs); @@ -298,6 +335,13 @@ protected: friend class ScalarFnCall; friend class HllHashFunction; + /// Constructs an Expr tree from the thrift Expr 'texpr'. 'root' is the root of the + /// Expr tree created from texpr.nodes[0] by the caller (either ScalarExpr or AggFn). + /// The newly created Expr nodes are added to 'pool'. Returns error status on failure. + static Status create_tree(const TExpr& texpr, ObjectPool* pool, Expr* root); + + int fn_ctx_idx() const { return _fn_ctx_idx; } + Expr(const TypeDescriptor& type); Expr(const TypeDescriptor& type, bool is_slotref); Expr(const TExprNode& node); @@ -346,6 +390,9 @@ protected: ExprContext* context, FunctionContext::FunctionStateScope scope); + /// Releases cache entries to LibCache in all nodes of the Expr tree. + virtual void close(); + /// Helper function that calls ctx->Register(), sets fn_context_index_, and returns the /// registered FunctionContext. FunctionContext* register_function_context( @@ -470,6 +517,35 @@ private: // Helper function for InlineConstants(). Returns the IR version of what GetConstant() // would return. 
llvm::Value* get_ir_constant(LlvmCodeGen* codegen, ExprConstant c, int i); + + /// Creates an expression tree rooted at 'root' via depth-first traversal. + /// Called recursively to create children expr trees for sub-expressions. + /// + /// parameters: + /// nodes: vector of thrift expression nodes to be unpacked. + /// It is essentially an Expr tree encoded in a depth-first manner. + /// pool: Object pool in which Expr created from nodes are stored. + /// root: root of the new tree. Created and initialized by the caller. + /// child_node_idx: index into 'nodes' to be unpacked. It's the root of the next child + /// child Expr tree to be added to 'root'. Updated as 'nodes' are + /// consumed to construct the tree. + /// return + /// status.ok() if successful + /// !status.ok() if tree is inconsistent or corrupt + static Status create_tree_internal(const std::vector& nodes, + ObjectPool* pool, Expr* parent, int* child_node_idx); + + /// 'fn_ctx_idx_' is the index into the FunctionContext vector in ScalarExprEvaluator + /// for storing FunctionContext needed to evaluate this ScalarExprNode. It's -1 if this + /// ScalarExpr doesn't need a FunctionContext. The FunctionContext is managed by the + /// evaluator and initialized by calling ScalarExpr::OpenEvaluator(). + int _fn_ctx_idx = -1; + + /// [fn_ctx_idx_start_, fn_ctx_idx_end_) defines the range in FunctionContext vector + /// in ScalarExpeEvaluator for the expression subtree rooted at this ScalarExpr node. 
+ int _fn_ctx_idx_start = 0; + int _fn_ctx_idx_end = 0; + }; inline bool Expr::evaluate(VectorizedRowBatch* batch) { diff --git a/be/src/exprs/expr_context.cpp b/be/src/exprs/expr_context.cpp index 594a3962c8..fab3f21fd2 100644 --- a/be/src/exprs/expr_context.cpp +++ b/be/src/exprs/expr_context.cpp @@ -26,10 +26,12 @@ #include "exprs/expr.h" #include "exprs/slot_ref.h" #include "runtime/mem_pool.h" +#include "runtime/mem_tracker.h" #include "runtime/runtime_state.h" #include "runtime/raw_value.h" #include "udf/udf_internal.h" #include "util/debug_util.h" +#include "exprs/anyval_util.h" namespace palo { @@ -76,6 +78,14 @@ Status ExprContext::open(RuntimeState* state) { return _root->open(state, this, scope); } +// TODO chenhao , replace ExprContext with ScalarExprEvaluator +Status ExprContext::open(std::vector evals, RuntimeState* state) { + for (int i = 0; i < evals.size(); ++i) { + RETURN_IF_ERROR(evals[i]->open(state)); + } + return Status::OK; +} + void ExprContext::close(RuntimeState* state) { DCHECK(!_closed); FunctionContext::FunctionStateScope scope = @@ -444,4 +454,53 @@ DecimalVal ExprContext::get_decimal_val(TupleRow* row) { return _root->get_decimal_val(this, row); } +Status ExprContext::get_const_value(RuntimeState* state, Expr& expr, + AnyVal** const_val) { + DCHECK(_opened); + if (!expr.is_constant()) { + *const_val = nullptr; + return Status::OK; + } + + // A constant expression shouldn't have any SlotRefs expr in it. 
+ DCHECK_EQ(expr.get_slot_ids(nullptr), 0); + DCHECK(_pool != nullptr); + const TypeDescriptor& result_type = expr.type(); + ObjectPool* obj_pool = state->obj_pool(); + *const_val = create_any_val(obj_pool, result_type); + if (*const_val == NULL) { + return Status("Could not create any val"); + } + + const void* result = ExprContext::get_value(&expr, nullptr); + AnyValUtil::set_any_val(result, result_type, *const_val); + if (result_type.is_string_type()) { + StringVal* sv = reinterpret_cast(*const_val); + if (!sv->is_null && sv->len > 0) { + // Make sure the memory is owned by this evaluator. + char* ptr_copy = reinterpret_cast(_pool->try_allocate(sv->len)); + if (ptr_copy == nullptr) { + return _pool->mem_tracker()->MemLimitExceeded( + state, "Could not allocate constant string value", sv->len); + } + memcpy(ptr_copy, sv->ptr, sv->len); + sv->ptr = reinterpret_cast(ptr_copy); + } + } + return get_error(expr._fn_ctx_idx_start, expr._fn_ctx_idx_end); +} + + +Status ExprContext::get_error(int start_idx, int end_idx) const { + DCHECK(_opened); + end_idx = end_idx == -1 ? _fn_contexts.size() : end_idx; + DCHECK_GE(start_idx, 0); + DCHECK_LE(end_idx, _fn_contexts.size()); + for (int idx = start_idx; idx < end_idx; ++idx) { + DCHECK_LT(idx, _fn_contexts.size()); + FunctionContext* fn_ctx = _fn_contexts[idx]; + if (fn_ctx->has_error()) return Status(fn_ctx->error_msg()); + } + return Status::OK; +} } diff --git a/be/src/exprs/expr_context.h b/be/src/exprs/expr_context.h index 580522c197..2ef80fa80b 100644 --- a/be/src/exprs/expr_context.h +++ b/be/src/exprs/expr_context.h @@ -62,6 +62,9 @@ public: /// reinitializing function state). Status open(RuntimeState* state); + //TODO chenhao + static Status open(std::vector input_evals, RuntimeState* state); + /// Creates a copy of this ExprContext. Open() must be called first. 
The copy contains /// clones of each FunctionContext, which share the fragment-local state of the /// originals but have their own MemPool and thread-local state. Clone() should be used @@ -151,6 +154,24 @@ public: static const char* _s_llvm_class_name; + bool opened() { + return _opened; + } + + /// If 'expr' is constant, evaluates it with no input row argument and returns the + /// result in 'const_val'. Sets 'const_val' to NULL if the argument is not constant. + /// The returned AnyVal and associated varlen data is owned by this evaluator. This + /// should only be called after Open() has been called on this expr. Returns an error + /// if there was an error evaluating the expression or if memory could not be allocated + /// for the expression result. + Status get_const_value(RuntimeState* state, Expr& expr, AnyVal** const_val); + + /// Returns an error status if there was any error in evaluating the expression + /// or its sub-expressions. 'start_idx' and 'end_idx' correspond to the range + /// within the vector of FunctionContext for the sub-expressions of interest. + /// The default parameters correspond to the entire expr 'root_'. 
+ Status get_error(int start_idx, int end_idx) const; + private: friend class Expr; friend class ScalarFnCall; diff --git a/be/src/exprs/hybird_set.h b/be/src/exprs/hybird_set.h index c27b63094f..c7994362f5 100644 --- a/be/src/exprs/hybird_set.h +++ b/be/src/exprs/hybird_set.h @@ -39,6 +39,8 @@ public: } virtual void insert(void* data) = 0; + virtual void insert(HybirdSetBase* set) = 0; + virtual int size() = 0; virtual bool find(void* data) = 0; @@ -55,7 +57,6 @@ public: }; virtual IteratorBase* begin() = 0; - }; template @@ -68,7 +69,19 @@ public: } virtual void insert(void* data) { - _set.insert(*reinterpret_cast(data)); + if (sizeof(T) >= 16) { + // for largeint, it will core dump with no memcpy + T value; + memcpy(&value, data, sizeof(T)); + _set.insert(value); + } else { + _set.insert(*reinterpret_cast(data)); + } + } + + virtual void insert(HybirdSetBase* set) { + HybirdSet* hybird_set = reinterpret_cast*>(set); + _set.insert(hybird_set->_set.begin(), hybird_set->_set.end()); } virtual int size() { @@ -114,7 +127,9 @@ public: IteratorBase* begin() { return _pool.add(new(std::nothrow) Iterator(_set.begin(), _set.end())); } + private: + std::unordered_set _set; ObjectPool _pool; }; @@ -133,6 +148,11 @@ public: _set.insert(str_value); } + void insert(HybirdSetBase* set) { + StringValueSet* string_set = reinterpret_cast(set); + _set.insert(string_set->_set.begin(), string_set->_set.end()); + } + virtual int size() { return _set.size(); } @@ -179,7 +199,9 @@ public: IteratorBase* begin() { return _pool.add(new(std::nothrow) Iterator(_set.begin(), _set.end())); } + private: + std::unordered_set _set; ObjectPool _pool; }; diff --git a/be/src/exprs/in_predicate.h b/be/src/exprs/in_predicate.h index ca8e329208..847c2783c8 100644 --- a/be/src/exprs/in_predicate.h +++ b/be/src/exprs/in_predicate.h @@ -25,7 +25,6 @@ #include #include #include "exprs/predicate.h" -#include "exec/hash_join_node.h" #include "runtime/raw_value.h" #include "exprs/hybird_set.h" diff --git 
a/be/src/exprs/new_agg_fn_evaluator.cc b/be/src/exprs/new_agg_fn_evaluator.cc new file mode 100644 index 0000000000..81eb2950c3 --- /dev/null +++ b/be/src/exprs/new_agg_fn_evaluator.cc @@ -0,0 +1,597 @@ +// Modifications copyright (C) 2017, Baidu.com, Inc. +// Copyright 2017 The Apache Software Foundation + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "exprs/new_agg_fn_evaluator.h" + +#include + +#include "codegen/llvm_codegen.h" +#include "common/logging.h" +#include "exprs/aggregate_functions.h" +#include "exprs/agg_fn.h" +#include "exprs/anyval_util.h" +#include "exprs/expr_context.h" +#include "exprs/expr.h" +#include "exprs/scalar_fn_call.h" +#include "gutil/strings/substitute.h" +#include "runtime/lib_cache.h" +#include "runtime/mem_tracker.h" +#include "runtime/raw_value.h" +#include "runtime/runtime_state.h" +#include "runtime/string_value.h" +#include "udf/udf_internal.h" +#include "util/debug_util.h" + +#include + +#include "common/names.h" + +using namespace palo; +using namespace palo_udf; +using namespace llvm; +using std::move; + +// typedef for builtin aggregate functions. 
Unfortunately, these type defs don't +// really work since the actual builtin is implemented not in terms of the base +// AnyVal* type. Due to this, there are lots of casts when we use these typedefs. +// TODO: these typedefs exists as wrappers to go from (TupleRow, Tuple) to the +// types the aggregation functions need. This needs to be done with codegen instead. +typedef void (*InitFn)(FunctionContext*, AnyVal*); +typedef void (*UpdateFn0)(FunctionContext*, AnyVal*); +typedef void (*UpdateFn1)(FunctionContext*, const AnyVal&, AnyVal*); +typedef void (*UpdateFn2)(FunctionContext*, const AnyVal&, const AnyVal&, AnyVal*); +typedef void (*UpdateFn3)(FunctionContext*, const AnyVal&, const AnyVal&, + const AnyVal&, AnyVal*); +typedef void (*UpdateFn4)(FunctionContext*, const AnyVal&, const AnyVal&, + const AnyVal&, const AnyVal&, AnyVal*); +typedef void (*UpdateFn5)(FunctionContext*, const AnyVal&, const AnyVal&, + const AnyVal&, const AnyVal&, const AnyVal&, AnyVal*); +typedef void (*UpdateFn6)(FunctionContext*, const AnyVal&, const AnyVal&, + const AnyVal&, const AnyVal&, const AnyVal&, const AnyVal&, AnyVal*); +typedef void (*UpdateFn7)(FunctionContext*, const AnyVal&, const AnyVal&, + const AnyVal&, const AnyVal&, const AnyVal&, const AnyVal&, const AnyVal&, AnyVal*); +typedef void (*UpdateFn8)(FunctionContext*, const AnyVal&, const AnyVal&, + const AnyVal&, const AnyVal&, const AnyVal&, const AnyVal&, const AnyVal&, + const AnyVal&, AnyVal*); +typedef StringVal (*SerializeFn)(FunctionContext*, const StringVal&); +typedef AnyVal (*GetValueFn)(FunctionContext*, const AnyVal&); +typedef AnyVal (*FinalizeFn)(FunctionContext*, const AnyVal&); + +const char* NewAggFnEvaluator::LLVM_CLASS_NAME = "class.impala::NewAggFnEvaluator"; +const int DEFAULT_MULTI_DISTINCT_COUNT_STRING_BUFFER_SIZE = 1024; + +NewAggFnEvaluator::NewAggFnEvaluator(const AggFn& agg_fn, MemPool* mem_pool, MemTracker* tracker, bool is_clone) + : _total_mem_consumption(0), + 
_accumulated_mem_consumption(0), + is_clone_(is_clone), + agg_fn_(agg_fn), + mem_pool_(mem_pool), + _mem_tracker(tracker) { +} + +NewAggFnEvaluator::~NewAggFnEvaluator() { + if (UNLIKELY(_total_mem_consumption > 0)) { + _mem_tracker->release(_total_mem_consumption); + } + DCHECK(closed_); +} + +const SlotDescriptor& NewAggFnEvaluator::intermediate_slot_desc() const { + return agg_fn_.intermediate_slot_desc(); +} + +const TypeDescriptor& NewAggFnEvaluator::intermediate_type() const { + return agg_fn_.intermediate_type(); +} + +Status NewAggFnEvaluator::Create(const AggFn& agg_fn, RuntimeState* state, ObjectPool* pool, + MemPool* mem_pool, NewAggFnEvaluator** result, + MemTracker* tracker, const RowDescriptor& row_desc) { + *result = nullptr; + + // Create a new AggFn evaluator. + NewAggFnEvaluator* agg_fn_eval = pool->add(new NewAggFnEvaluator(agg_fn, mem_pool, tracker, false)); + + agg_fn_eval->agg_fn_ctx_.reset(FunctionContextImpl::create_context(state, mem_pool, + agg_fn.GetIntermediateTypeDesc(), agg_fn.GetOutputTypeDesc(), + agg_fn.arg_type_descs(), 0, false)); + + Status status; + // Create the evaluators for the input expressions. 
+ for (Expr* input_expr : agg_fn.children()) { + // TODO chenhao replace ExprContext with ScalarFnEvaluator + ExprContext* input_eval = pool->add(new ExprContext(input_expr)); + if (input_eval == nullptr) goto cleanup; + input_eval->prepare(state, row_desc, tracker); + agg_fn_eval->input_evals_.push_back(input_eval); + Expr* root = input_eval->root(); + DCHECK(root == input_expr); + AnyVal* staging_input_val; + status = allocate_any_val(state, mem_pool, input_expr->type(), + "Could not allocate aggregate expression input value", &staging_input_val); + agg_fn_eval->staging_input_vals_.push_back(staging_input_val); + if (UNLIKELY(!status.ok())) goto cleanup; + } + DCHECK_EQ(agg_fn.get_num_children(), agg_fn_eval->input_evals_.size()); + DCHECK_EQ(agg_fn_eval->staging_input_vals_.size(), agg_fn_eval->input_evals_.size()); + + status = allocate_any_val(state, mem_pool, agg_fn.intermediate_type(), + "Could not allocate aggregate expression intermediate value", + &(agg_fn_eval->staging_intermediate_val_)); + if (UNLIKELY(!status.ok())) goto cleanup; + status = allocate_any_val(state, mem_pool, agg_fn.intermediate_type(), + "Could not allocate aggregate expression merge input value", + &(agg_fn_eval->staging_merge_input_val_)); + if (UNLIKELY(!status.ok())) goto cleanup; + + if (agg_fn.is_merge()) { + DCHECK_EQ(agg_fn_eval->staging_input_vals_.size(), 1) + << "Merge should only have 1 input."; + } + + *result = agg_fn_eval; + return Status::OK; + +cleanup: + DCHECK(!status.ok()); + agg_fn_eval->Close(state); + return status; +} + +Status NewAggFnEvaluator::Create(const vector& agg_fns, RuntimeState* state, + ObjectPool* pool, MemPool* mem_pool, vector* evals, + MemTracker* tracker, const RowDescriptor& row_desc) { + for (const AggFn* agg_fn : agg_fns) { + NewAggFnEvaluator* agg_fn_eval; + RETURN_IF_ERROR(NewAggFnEvaluator::Create(*agg_fn, state, pool, mem_pool, + &agg_fn_eval, tracker, row_desc)); + evals->push_back(agg_fn_eval); + } + return Status::OK; +} + +Status 
NewAggFnEvaluator::Open(RuntimeState* state) { + if (opened_) return Status::OK; + opened_ = true; + // TODO chenhao, ScalarFnEvaluator different from ExprContext + RETURN_IF_ERROR(ExprContext::open(input_evals_, state)); + // Now that we have opened all our input exprs, it is safe to evaluate any constant + // values for the UDA's FunctionContext (we cannot evaluate exprs before calling Open() + // on them). + vector constant_args(input_evals_.size(), nullptr); + for (int i = 0; i < input_evals_.size(); ++i) { + ExprContext* eval = input_evals_[i]; + RETURN_IF_ERROR(eval->get_const_value(state, *(agg_fn_.get_child(i)), + &constant_args[i])); + } + agg_fn_ctx_->impl()->set_constant_args(move(constant_args)); + return Status::OK; +} + +Status NewAggFnEvaluator::Open( + const vector& evals, RuntimeState* state) { + for (NewAggFnEvaluator* eval : evals) RETURN_IF_ERROR(eval->Open(state)); + return Status::OK; +} + +void NewAggFnEvaluator::Close(RuntimeState* state) { + if (closed_) return; + closed_ = true; + if (!is_clone_) Expr::close(input_evals_, state); + // TODO chenhao + //FreeLocalAllocations(); + agg_fn_ctx_->impl()->close(); + agg_fn_ctx_.reset(); + + //TODO chenhao release ExprContext + //for (int i = 0; i < input_evals_.size(); i++) { + // ExprContext* context = input_evals_[i]; + // delete context; + //} + input_evals_.clear(); +} + + +void NewAggFnEvaluator::Close( + const vector& evals, RuntimeState* state) { + for (NewAggFnEvaluator* eval : evals) eval->Close(state); +} + +void NewAggFnEvaluator::SetDstSlot(const AnyVal* src, const SlotDescriptor& dst_slot_desc, + Tuple* dst) { + if (src->is_null) { + dst->set_null(dst_slot_desc.null_indicator_offset()); + return; + } + + dst->set_not_null(dst_slot_desc.null_indicator_offset()); + void* slot = dst->get_slot(dst_slot_desc.tuple_offset()); + switch (dst_slot_desc.type().type) { + case TYPE_NULL: + return; + case TYPE_BOOLEAN: + *reinterpret_cast(slot) = reinterpret_cast(src)->val; + return; + case 
TYPE_TINYINT: + *reinterpret_cast(slot) = reinterpret_cast(src)->val; + return; + case TYPE_SMALLINT: + *reinterpret_cast(slot) = reinterpret_cast(src)->val; + return; + case TYPE_INT: + *reinterpret_cast(slot) = reinterpret_cast(src)->val; + return; + case TYPE_BIGINT: + *reinterpret_cast(slot) = reinterpret_cast(src)->val; + return; + case TYPE_LARGEINT: + memcpy(slot, &reinterpret_cast(src)->val, sizeof(__int128)); + return; + case TYPE_FLOAT: + *reinterpret_cast(slot) = reinterpret_cast(src)->val; + return; + case TYPE_DOUBLE: + *reinterpret_cast(slot) = reinterpret_cast(src)->val; + return; + case TYPE_CHAR: + case TYPE_VARCHAR: + case TYPE_HLL: + *reinterpret_cast(slot) = + StringValue::from_string_val(*reinterpret_cast(src)); + return; + case TYPE_DATE: + case TYPE_DATETIME: + *reinterpret_cast(slot) = DateTimeValue::from_datetime_val( + *reinterpret_cast(src)); + return; + case TYPE_DECIMAL: + *reinterpret_cast(slot) = DecimalValue::from_decimal_val( + *reinterpret_cast(src)); + return; + default: + DCHECK(false) << "NYI: " << dst_slot_desc.type(); + } +} + +// This function would be replaced in codegen. +void NewAggFnEvaluator::Init(Tuple* dst) { + DCHECK(opened_); + DCHECK(agg_fn_.init_fn_ != nullptr); + for (ExprContext* input_eval : input_evals_) { + DCHECK(input_eval->opened()); + } + + const TypeDescriptor& type = intermediate_type(); + const SlotDescriptor& slot_desc = intermediate_slot_desc(); + if (type.type == TYPE_CHAR) { + // The intermediate value is represented as a fixed-length buffer inline in the tuple. + // The aggregate function writes to this buffer directly. staging_intermediate_val_ + // is a StringVal with a pointer to the slot and the length of the slot. 
+ void* slot = dst->get_slot(slot_desc.tuple_offset()); + StringVal* sv = reinterpret_cast(staging_intermediate_val_); + sv->is_null = dst->is_null(slot_desc.null_indicator_offset()); + sv->ptr = reinterpret_cast(slot); + sv->len = type.len; + } + reinterpret_cast(agg_fn_.init_fn_)( + agg_fn_ctx_.get(), staging_intermediate_val_); + SetDstSlot(staging_intermediate_val_, slot_desc, dst); + agg_fn_ctx_->impl()->set_num_updates(0); + agg_fn_ctx_->impl()->set_num_removes(0); +} + +static void SetAnyVal(const SlotDescriptor& desc, Tuple* tuple, AnyVal* dst) { + bool is_null = tuple->is_null(desc.null_indicator_offset()); + void* slot = nullptr; + if (!is_null) slot = tuple->get_slot(desc.tuple_offset()); + AnyValUtil::set_any_val(slot, desc.type(), dst); +} + + +// Utility to put val into an AnyVal struct +inline void NewAggFnEvaluator::set_any_val( + const void* slot, + const TypeDescriptor& type, AnyVal* dst) { + if (slot == NULL) { + dst->is_null = true; + return; + } + + dst->is_null = false; + + switch (type.type) { + case TYPE_NULL: + return; + + case TYPE_BOOLEAN: + reinterpret_cast(dst)->val = *reinterpret_cast(slot); + return; + + case TYPE_TINYINT: + reinterpret_cast(dst)->val = *reinterpret_cast(slot); + return; + + case TYPE_SMALLINT: + reinterpret_cast(dst)->val = *reinterpret_cast(slot); + return; + + case TYPE_INT: + reinterpret_cast(dst)->val = *reinterpret_cast(slot); + return; + + case TYPE_BIGINT: + reinterpret_cast(dst)->val = *reinterpret_cast(slot); + return; + + case TYPE_FLOAT: + reinterpret_cast(dst)->val = *reinterpret_cast(slot); + return; + + case TYPE_DOUBLE: + reinterpret_cast(dst)->val = *reinterpret_cast(slot); + return; + + case TYPE_CHAR: + case TYPE_VARCHAR: + case TYPE_HLL: + reinterpret_cast(slot)->to_string_val( + reinterpret_cast(dst)); + return; + + case TYPE_DATE: + case TYPE_DATETIME: + reinterpret_cast(slot)->to_datetime_val( + reinterpret_cast(dst)); + return; + + case TYPE_DECIMAL: + reinterpret_cast(slot)->to_decimal_val( + 
reinterpret_cast(dst)); + return; + + case TYPE_LARGEINT: + memcpy(&reinterpret_cast(dst)->val, slot, sizeof(__int128)); + return; + + default: + DCHECK(false) << "NYI"; + } +} + +void NewAggFnEvaluator::Update(const TupleRow* row, Tuple* dst, void* fn) { + if (fn == nullptr) return; + + const SlotDescriptor& slot_desc = intermediate_slot_desc(); + SetAnyVal(slot_desc, dst, staging_intermediate_val_); + for (int i = 0; i < input_evals_.size(); ++i) { + void* src_slot = input_evals_[i]->get_value(const_cast(row)); + DCHECK(input_evals_[i]->root() == agg_fn_.get_child(i)); + AnyValUtil::set_any_val(src_slot, agg_fn_.get_child(i)->type(), staging_input_vals_[i]); + } + + // TODO: this part is not so good and not scalable. It can be replaced with + // codegen but we can also consider leaving it for the first few cases for + // debugging. + switch (input_evals_.size()) { + case 0: + reinterpret_cast(fn)(agg_fn_ctx_.get(), staging_intermediate_val_); + break; + case 1: + reinterpret_cast(fn)(agg_fn_ctx_.get(), + *staging_input_vals_[0], staging_intermediate_val_); + break; + case 2: + reinterpret_cast(fn)(agg_fn_ctx_.get(), + *staging_input_vals_[0], *staging_input_vals_[1], staging_intermediate_val_); + break; + case 3: + reinterpret_cast(fn)(agg_fn_ctx_.get(), + *staging_input_vals_[0], *staging_input_vals_[1], + *staging_input_vals_[2], staging_intermediate_val_); + break; + case 4: + reinterpret_cast(fn)(agg_fn_ctx_.get(), + *staging_input_vals_[0], *staging_input_vals_[1], + *staging_input_vals_[2], *staging_input_vals_[3], staging_intermediate_val_); + break; + case 5: + reinterpret_cast(fn)(agg_fn_ctx_.get(), + *staging_input_vals_[0], *staging_input_vals_[1], + *staging_input_vals_[2], *staging_input_vals_[3], + *staging_input_vals_[4], staging_intermediate_val_); + break; + case 6: + reinterpret_cast(fn)(agg_fn_ctx_.get(), + *staging_input_vals_[0], *staging_input_vals_[1], + *staging_input_vals_[2], *staging_input_vals_[3], + *staging_input_vals_[4], 
*staging_input_vals_[5], staging_intermediate_val_); + break; + case 7: + reinterpret_cast(fn)(agg_fn_ctx_.get(), + *staging_input_vals_[0], *staging_input_vals_[1], + *staging_input_vals_[2], *staging_input_vals_[3], + *staging_input_vals_[4], *staging_input_vals_[5], + *staging_input_vals_[6], staging_intermediate_val_); + break; + case 8: + reinterpret_cast(fn)(agg_fn_ctx_.get(), + *staging_input_vals_[0], *staging_input_vals_[1], + *staging_input_vals_[2], *staging_input_vals_[3], + *staging_input_vals_[4], *staging_input_vals_[5], + *staging_input_vals_[6], *staging_input_vals_[7], + staging_intermediate_val_); + break; + default: + DCHECK(false) << "NYI"; + } + SetDstSlot(staging_intermediate_val_, slot_desc, dst); +} + +void NewAggFnEvaluator::Merge(Tuple* src, Tuple* dst) { + DCHECK(agg_fn_.merge_fn_ != nullptr); + const SlotDescriptor& slot_desc = intermediate_slot_desc(); + SetAnyVal(slot_desc, dst, staging_intermediate_val_); + SetAnyVal(slot_desc, src, staging_merge_input_val_); + // The merge fn always takes one input argument. + reinterpret_cast(agg_fn_.merge_fn_)(agg_fn_ctx_.get(), + *staging_merge_input_val_, staging_intermediate_val_); + SetDstSlot(staging_intermediate_val_, slot_desc, dst); +} + +void NewAggFnEvaluator::SerializeOrFinalize(Tuple* src, + const SlotDescriptor& dst_slot_desc, Tuple* dst, void* fn) { + // No fn was given and the src and dst are identical. Nothing to be done. + if (fn == nullptr && src == dst) return; + // src != dst means we are performing a Finalize(), so even if fn == null we + // still must copy the value of the src slot into dst. + + const SlotDescriptor& slot_desc = intermediate_slot_desc(); + bool src_slot_null = src->is_null(slot_desc.null_indicator_offset()); + void* src_slot = nullptr; + if (!src_slot_null) src_slot = src->get_slot(slot_desc.tuple_offset()); + + // No fn was given but the src and dst tuples are different (doing a Finalize()). + // Just copy the src slot into the dst tuple. 
+ if (fn == nullptr) { + DCHECK_EQ(intermediate_type(), dst_slot_desc.type()); + RawValue::write(src_slot, dst, &dst_slot_desc, nullptr); + return; + } + + AnyValUtil::set_any_val(src_slot, intermediate_type(), staging_intermediate_val_); + switch (dst_slot_desc.type().type) { + case TYPE_BOOLEAN: { + typedef BooleanVal(*Fn)(FunctionContext*, AnyVal*); + BooleanVal v = reinterpret_cast(fn)( + agg_fn_ctx_.get(), staging_intermediate_val_); + SetDstSlot(&v, dst_slot_desc, dst); + break; + } + case TYPE_TINYINT: { + typedef TinyIntVal(*Fn)(FunctionContext*, AnyVal*); + TinyIntVal v = reinterpret_cast(fn)( + agg_fn_ctx_.get(), staging_intermediate_val_); + SetDstSlot(&v, dst_slot_desc, dst); + break; + } + case TYPE_SMALLINT: { + typedef SmallIntVal(*Fn)(FunctionContext*, AnyVal*); + SmallIntVal v = reinterpret_cast(fn)( + agg_fn_ctx_.get(), staging_intermediate_val_); + SetDstSlot(&v, dst_slot_desc, dst); + break; + } + case TYPE_INT: { + typedef IntVal(*Fn)(FunctionContext*, AnyVal*); + IntVal v = reinterpret_cast(fn)( + agg_fn_ctx_.get(), staging_intermediate_val_); + SetDstSlot(&v, dst_slot_desc, dst); + break; + } + case TYPE_BIGINT: { + typedef BigIntVal(*Fn)(FunctionContext*, AnyVal*); + BigIntVal v = reinterpret_cast(fn)( + agg_fn_ctx_.get(), staging_intermediate_val_); + SetDstSlot(&v, dst_slot_desc, dst); + break; + } + case TYPE_LARGEINT: { + typedef LargeIntVal(*Fn)(FunctionContext*, AnyVal*); + LargeIntVal v = reinterpret_cast(fn)( + agg_fn_ctx_.get(), staging_intermediate_val_); + SetDstSlot(&v, dst_slot_desc, dst); + break; + } + case TYPE_FLOAT: { + typedef FloatVal(*Fn)(FunctionContext*, AnyVal*); + FloatVal v = reinterpret_cast(fn)( + agg_fn_ctx_.get(), staging_intermediate_val_); + SetDstSlot(&v, dst_slot_desc, dst); + break; + } + case TYPE_DOUBLE: { + typedef DoubleVal(*Fn)(FunctionContext*, AnyVal*); + DoubleVal v = reinterpret_cast(fn)( + agg_fn_ctx_.get(), staging_intermediate_val_); + SetDstSlot(&v, dst_slot_desc, dst); + break; + } + case 
TYPE_CHAR: + case TYPE_VARCHAR: + case TYPE_HLL:{ + typedef StringVal(*Fn)(FunctionContext*, AnyVal*); + StringVal v = reinterpret_cast(fn)( + agg_fn_ctx_.get(), staging_intermediate_val_); + SetDstSlot(&v, dst_slot_desc, dst); + break; + } + case TYPE_DECIMAL: { + typedef DecimalVal(*Fn)(FunctionContext*, AnyVal*); + DecimalVal v = reinterpret_cast(fn)( + agg_fn_ctx_.get(), staging_intermediate_val_); + SetDstSlot(&v, dst_slot_desc, dst); + break; + } + case TYPE_DATE: + case TYPE_DATETIME: { + typedef DateTimeVal(*Fn)(FunctionContext*, AnyVal*); + DateTimeVal v = reinterpret_cast(fn)( + agg_fn_ctx_.get(), staging_intermediate_val_); + SetDstSlot(&v, dst_slot_desc, dst); + break; + } + default: + DCHECK(false) << "NYI"; + } +} + +void NewAggFnEvaluator::ShallowClone(ObjectPool* pool, MemPool* mem_pool, + NewAggFnEvaluator** cloned_eval) const { + DCHECK(opened_); + *cloned_eval = pool->add(new NewAggFnEvaluator(agg_fn_, mem_pool, _mem_tracker, true)); + (*cloned_eval)->agg_fn_ctx_.reset(agg_fn_ctx_->impl()->clone(mem_pool)); + DCHECK_EQ((*cloned_eval)->input_evals_.size(), 0); + (*cloned_eval)->input_evals_ = input_evals_; + (*cloned_eval)->staging_input_vals_ = staging_input_vals_; + (*cloned_eval)->staging_intermediate_val_ = staging_intermediate_val_; + (*cloned_eval)->staging_merge_input_val_ = staging_merge_input_val_; + (*cloned_eval)->opened_ = true; +} + +void NewAggFnEvaluator::ShallowClone(ObjectPool* pool, MemPool* mem_pool, + const vector& evals, + vector* cloned_evals) { + for (const NewAggFnEvaluator* eval : evals) { + NewAggFnEvaluator* cloned_eval; + eval->ShallowClone(pool, mem_pool, &cloned_eval); + cloned_evals->push_back(cloned_eval); + } +} + +// +//void NewAggFnEvaluator::FreeLocalAllocations() { +// ExprContext::FreeLocalAllocations(input_evals_); +// agg_fn_ctx_->impl()->FreeLocalAllocations(); +//} + +//void NewAggFnEvaluator::FreeLocalAllocations(const vector& evals) { +// for (NewAggFnEvaluator* eval : evals) 
eval->FreeLocalAllocations(); +//} + diff --git a/be/src/exprs/new_agg_fn_evaluator.h b/be/src/exprs/new_agg_fn_evaluator.h new file mode 100644 index 0000000000..69e5e84007 --- /dev/null +++ b/be/src/exprs/new_agg_fn_evaluator.h @@ -0,0 +1,345 @@ +// Modifications copyright (C) 2017, Baidu.com, Inc. +// Copyright 2017 The Apache Software Foundation + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef IMPALA_EXPRS_AGG_FN_EVALUATOR_H +#define IMPALA_EXPRS_AGG_FN_EVALUATOR_H + +#include + +#include +#include +#include "codegen/palo_ir.h" +#include "common/compiler_util.h" +#include "common/status.h" +#include "exprs/agg_fn.h" +#include "exprs/hybird_map.h" +#include "runtime/descriptors.h" +#include "runtime/lib_cache.h" +#include "runtime/tuple_row.h" +#include "runtime/types.h" +#include "udf/udf.h" +#include "udf/udf_internal.h" + +#include "gen_cpp/Exprs_types.h" +#include "gen_cpp/PlanNodes_types.h" +#include "gen_cpp/Types_types.h" + +namespace palo { + +class MemPool; +class MemTracker; +class ObjectPool; +class RowDescriptor; +class RuntimeState; +class SlotDescriptor; +class Tuple; +class TupleRow; +class TExprNode; +class ExprContext; + +/// NewAggFnEvaluator is the interface for evaluating aggregate functions during execution. +/// +/// NewAggFnEvaluator contains runtime state and implements wrapper functions which convert +/// the input TupleRow into AnyVal format expected by UDAF functions defined in AggFn. +/// It also evaluates TupleRow against input expressions, stores the results in staging +/// input values which are passed to Update() function to update the intermediate value +/// and handles the merging of intermediate values in the merge phases of execution. +/// +/// This class is not threadsafe. An evaluator can be cloned to isolate resource +/// consumption per partition in an aggregation node. +/// +class NewAggFnEvaluator { + public: + /// Creates an NewAggFnEvaluator object from the aggregate expression 'agg_fn'. + /// The evaluator is added to 'pool' and returned in 'eval'. This will also + /// create a single evaluator for each input expression. All allocations will come + /// from 'mem_pool'. Note that it's the responsibility to call Close() all evaluators + /// even if this function returns error status on initialization failure. 
+ static Status Create(const AggFn& agg_fn, RuntimeState* state, ObjectPool* pool, + MemPool* mem_pool, NewAggFnEvaluator** eval, MemTracker* tracker, + const RowDescriptor& row_desc) WARN_UNUSED_RESULT; + + /// Convenience functions for creating evaluators for multiple aggregate functions. + static Status Create(const std::vector& agg_fns, RuntimeState* state, + ObjectPool* pool, MemPool* mem_pool, std::vector* evals, + MemTracker* tracker, const RowDescriptor& row_desc) WARN_UNUSED_RESULT; + + ~NewAggFnEvaluator(); + + /// Initializes the evaluator by calling Open() on all the input expressions' evaluators + /// and caches all constant input arguments. + /// TODO: Move the evaluation of constant input arguments to AggFn setup. + Status Open(RuntimeState* state) WARN_UNUSED_RESULT; + + /// Convenience functions for opening multiple NewAggFnEvaluators. + static Status Open(const std::vector& evals, + RuntimeState* state) WARN_UNUSED_RESULT; + + /// Used by PartitionedAggregation node to initialize one evaluator per partition. + /// Avoid the overhead of re-initializing an evaluator (e.g. calling GetConstVal() + /// on the input expressions). Cannot be called until after Open() has been called. + /// 'cloned_eval' is a shallow copy of this evaluator: all input values, staging + /// intermediate values and merge values are shared with the original evaluator. Only + /// the FunctionContext 'agg_fn_ctx' is cloned for resource isolation per partition. + /// So, it's not safe to use cloned evaluators concurrently. + void ShallowClone( + ObjectPool* pool, MemPool* mem_pool, NewAggFnEvaluator** cloned_eval) const; + + /// Convenience function for cloning multiple evaluators. The newly cloned evaluators + /// are appended to 'cloned_evals'. + static void ShallowClone(ObjectPool* pool, MemPool* mem_pool, + const std::vector& evals, + std::vector* cloned_evals); + + /// Free resources owned by the evaluator. 
+ void Close(RuntimeState* state); + static void Close(const std::vector& evals, RuntimeState* state); + + const AggFn& agg_fn() const { return agg_fn_; } + + FunctionContext* IR_ALWAYS_INLINE agg_fn_ctx() const; + + ExprContext* const* IR_ALWAYS_INLINE input_evals() const; + + /// Call the initialization function of the AggFn. May update 'dst'. + void Init(Tuple* dst); + + /// Updates the intermediate state dst based on adding the input src row. This can be + /// called either to drive the UDA's Update() or Merge() function, depending on whether + /// the AggFn is a merging aggregation. + void Add(const TupleRow* src, Tuple* dst); + + /// Updates the intermediate state dst to remove the input src row, i.e. undo + /// Add(src, dst). Only used internally for analytic fn builtins. + void Remove(const TupleRow* src, Tuple* dst); + + /// Explicitly does a merge, even if this evaluator is not marked as merging. + /// This is used by the partitioned agg node when it needs to merge spill results. + /// In the non-spilling case, this node would normally not merge. + void Merge(Tuple* src, Tuple* dst); + + /// Flattens any intermediate values containing pointers, and frees any memory + /// allocated during the init, update and merge phases. + void Serialize(Tuple* dst); + + /// Does one final transformation of the aggregated value in 'agg_val' and stores the + /// result in 'output_val'. Also frees the resources allocated during init, update and + /// merge phases. + void Finalize(Tuple* agg_val, Tuple* output_val); + + /// Puts the finalized value from Tuple* src in Tuple* dst just as Finalize() does. + /// However, unlike Finalize(), GetValue() does not clean up state in src. + /// GetValue() can be called repeatedly with the same src. Only used internally for + /// analytic fn builtins. 
Note that StringVal result is from local allocation (which + /// will be freed in the next QueryMaintenance()) so it needs to be copied out if it + /// needs to survive beyond QueryMaintenance() (e.g. if 'dst' lives in a row batch). + void GetValue(Tuple* src, Tuple* dst); + + // TODO: implement codegen path. These functions would return IR functions with + // the same signature as the interpreted ones above. + // Function* GetIrInitFn(); + // Function* GetIrUpdateFn(); + // Function* GetIrMergeFn(); + // Function* GetIrSerializeFn(); + // Function* GetIrFinalizeFn(); + static const size_t TINYINT_SIZE = sizeof(int8_t); + static const size_t SMALLINT_SIZE = sizeof(int16_t); + static const size_t INT_SIZE = sizeof(int32_t); + static const size_t BIGINT_SIZE = sizeof(int64_t); + static const size_t FLOAT_SIZE = sizeof(float); + static const size_t DOUBLE_SIZE = sizeof(double); + static const size_t DECIMAL_SIZE = sizeof(DecimalValue); + static const size_t TIME_DURATION_SIZE = sizeof(boost::posix_time::time_duration); + static const size_t DATE_SIZE = sizeof(boost::gregorian::date); + static const size_t LARGEINT_SIZE = sizeof(__int128); + + // DATETIME VAL has two part: packet_time is 8 byte, and type is 4 byte + // MySQL packet time : int64_t packed_time; + // Indicate which type of this value : int type; + static const size_t DATETIME_SIZE = 16; + + bool is_multi_distinct() { + return _is_multi_distinct; + } + + const std::vector& input_expr_ctxs() const { + return input_evals_; + } + + /// Helper functions for calling the above functions on many evaluators. 
+ static void Init(const std::vector& evals, Tuple* dst); + static void Add(const std::vector& evals, const TupleRow* src, + Tuple* dst); + static void Remove(const std::vector& evals, + const TupleRow* src, Tuple* dst); + static void Serialize(const std::vector& evals, + Tuple* dst); + static void GetValue(const std::vector& evals, Tuple* src, + Tuple* dst); + static void Finalize(const std::vector& evals, Tuple* src, + Tuple* dst); + + /// Free local allocations made in UDA functions and input arguments' evals. + //void FreeLocalAllocations(); + //static void FreeLocalAllocations(const std::vector& evals); + + std::string DebugString() const; + static std::string DebugString(const std::vector& evals); + + static const char* LLVM_CLASS_NAME; + + private: + + uint64_t _total_mem_consumption; + uint64_t _accumulated_mem_consumption; + + // index if has multi count distinct + bool _is_multi_distinct; + + /// True if the evaluator has been initialized. + bool opened_ = false; + + /// True if the evaluator has been closed. + bool closed_ = false; + + /// True if this evaluator is created from a ShallowClone() call. + const bool is_clone_; + + const AggFn& agg_fn_; + + /// Pointer to the MemPool which all allocations come from. + /// Owned by the exec node which owns this evaluator. + MemPool* mem_pool_ = nullptr; + + MemTracker* _mem_tracker; // saved c'tor param + + /// This contains runtime state such as constant input arguments to the aggregate + /// functions and a FreePool from which the intermediate values are allocated. + /// Owned by this evaluator. + boost::scoped_ptr agg_fn_ctx_; + + /// Evaluators for input expressions for this aggregate function. + /// Empty if there is no input expression (e.g. count(*)). + std::vector input_evals_; + + /// Staging input values used by the interpreted Update() / Merge() paths. + /// It stores the evaluation results of input expressions to be passed to the + /// Update() / Merge() function. 
+ std::vector staging_input_vals_; + + /// Staging intermediate and merged values used in the interpreted + /// Update() / Merge() paths. + palo_udf::AnyVal* staging_intermediate_val_ = nullptr; + palo_udf::AnyVal* staging_merge_input_val_ = nullptr; + + /// Use Create() instead. + NewAggFnEvaluator(const AggFn& agg_fn, MemPool* mem_pool, MemTracker* tracker, bool is_clone); + + /// Return the intermediate type of the aggregate function. + inline const SlotDescriptor& intermediate_slot_desc() const; + inline const TypeDescriptor& intermediate_type() const; + + /// The interpreted path for the UDA's Update() function. It sets up the arguments to + /// call 'fn' is either the 'update_fn_' or 'merge_fn_' of agg_fn_, depending on whether + /// agg_fn_ is a merging aggregation. This converts from the agg-expr signature, taking + /// TupleRow to the UDA signature taking AnyVals by evaluating any input expressions + /// and populating the staging input values. + /// + /// Note that this function may be superseded by the codegend Update() IR function + /// generated by AggFn::CodegenUpdateOrMergeFunction() when codegen is enabled. + void Update(const TupleRow* row, Tuple* dst, void* fn); + + /// Sets up the arguments to call 'fn'. This converts from the agg-expr signature, + /// taking TupleRow to the UDA signature taking AnyVals. Writes the serialize/finalize + /// result to the given destination slot/tuple. 'fn' can be NULL to indicate the src + /// value should simply be written into the destination. Note that StringVal result is + /// from local allocation (which will be freed in the next QueryMaintenance()) so it + /// needs to be copied out if it needs to survive beyond QueryMaintenance() (e.g. if + /// 'dst' lives in a row batch). 
+ void SerializeOrFinalize(Tuple* src, const SlotDescriptor& dst_slot_desc, + Tuple* dst, void* fn); + + /// Writes the result in src into dst pointed to by dst_slot_desc + inline void SetDstSlot( + const palo_udf::AnyVal* src, const SlotDescriptor& dst_slot_desc, Tuple* dst); + + // Sets 'dst' to the value from 'slot'. + void set_any_val(const void* slot, const TypeDescriptor& type, palo_udf::AnyVal* dst); +}; + +inline void NewAggFnEvaluator::Add(const TupleRow* row, Tuple* dst) { + agg_fn_ctx_->impl()->increment_num_updates(); + Update(row, dst, agg_fn_.merge_or_update_fn()); +} + +inline void NewAggFnEvaluator::Remove(const TupleRow* row, Tuple* dst) { + agg_fn_ctx_->impl()->increment_num_removes(); + Update(row, dst, agg_fn_.remove_fn()); +} + +inline void NewAggFnEvaluator::Serialize(Tuple* tuple) { + SerializeOrFinalize(tuple, agg_fn_.intermediate_slot_desc(), tuple, + agg_fn_.serialize_fn()); +} + +inline void NewAggFnEvaluator::Finalize(Tuple* agg_val, Tuple* output_val) { + SerializeOrFinalize(agg_val, agg_fn_.output_slot_desc(), output_val, + agg_fn_.finalize_fn()); +} + +inline void NewAggFnEvaluator::GetValue(Tuple* src, Tuple* dst) { + SerializeOrFinalize(src, agg_fn_.output_slot_desc(), dst, + agg_fn_.get_value_fn()); +} + +inline void NewAggFnEvaluator::Init(const std::vector& evals, Tuple* dst) { + for (int i = 0; i < evals.size(); ++i) evals[i]->Init(dst); +} + +inline void NewAggFnEvaluator::Add(const std::vector& evals, + const TupleRow* src, Tuple* dst) { + for (int i = 0; i < evals.size(); ++i) evals[i]->Add(src, dst); +} + +inline void NewAggFnEvaluator::Remove(const std::vector& evals, + const TupleRow* src, Tuple* dst) { + for (int i = 0; i < evals.size(); ++i) evals[i]->Remove(src, dst); +} + +inline void NewAggFnEvaluator::Serialize(const std::vector& evals, + Tuple* dst) { + for (int i = 0; i < evals.size(); ++i) evals[i]->Serialize(dst); +} + +inline void NewAggFnEvaluator::GetValue(const std::vector& evals, + Tuple* src, Tuple* dst) { 
+ for (int i = 0; i < evals.size(); ++i) evals[i]->GetValue(src, dst); +} + +inline void NewAggFnEvaluator::Finalize(const std::vector& evals, + Tuple* agg_val, Tuple* output_val) { + for (int i = 0; i < evals.size(); ++i) { + evals[i]->Finalize(agg_val, output_val); + } +} + +} + +#endif diff --git a/be/src/exprs/new_agg_fn_evaluator_ir.cc b/be/src/exprs/new_agg_fn_evaluator_ir.cc new file mode 100644 index 0000000000..9d118989c8 --- /dev/null +++ b/be/src/exprs/new_agg_fn_evaluator_ir.cc @@ -0,0 +1,31 @@ +// Modifications copyright (C) 2017, Baidu.com, Inc. +// Copyright 2017 The Apache Software Foundation + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "exprs/new_agg_fn_evaluator.h" + +using namespace palo; + +FunctionContext* NewAggFnEvaluator::agg_fn_ctx() const { + return agg_fn_ctx_.get(); +} + +ExprContext* const* NewAggFnEvaluator::input_evals() const { + return input_evals_.data(); +} diff --git a/be/src/exprs/scalar_fn_call.cpp b/be/src/exprs/scalar_fn_call.cpp index a7a0988357..76f5945668 100644 --- a/be/src/exprs/scalar_fn_call.cpp +++ b/be/src/exprs/scalar_fn_call.cpp @@ -423,7 +423,7 @@ Status ScalarFnCall::get_udf(RuntimeState* state, Function** udf) { _fn.scalar_fn.symbol.find("add_sub") != std::string::npos; if (_fn.binary_type == TFunctionBinaryType::NATIVE || (_fn.binary_type == TFunctionBinaryType::BUILTIN - && (!state->codegen_level() > 0 || broken_builtin))) { + && (!(state->codegen_level() > 0) || broken_builtin))) { // In this path, we are code that has been statically compiled to assembly. // This can either be a UDF implemented in a .so or a builtin using the UDF // interface with the code in impalad. 
diff --git a/be/src/exprs/slot_ref.cpp b/be/src/exprs/slot_ref.cpp index 1c979c3f2b..e1019f2843 100644 --- a/be/src/exprs/slot_ref.cpp +++ b/be/src/exprs/slot_ref.cpp @@ -26,6 +26,7 @@ #include "codegen/llvm_codegen.h" #include "gen_cpp/Exprs_types.h" #include "runtime/runtime_state.h" +#include "util/types.h" using llvm::BasicBlock; using llvm::Constant; @@ -467,7 +468,7 @@ LargeIntVal SlotRef::get_large_int_val(ExprContext* context, TupleRow* row) { if (t == NULL || t->is_null(_null_indicator_offset)) { return LargeIntVal::null(); } - return LargeIntVal(*reinterpret_cast<__int128*>(t->get_slot(_slot_offset))); + return LargeIntVal(reinterpret_cast(t->get_slot(_slot_offset))->value); } FloatVal SlotRef::get_float_val(ExprContext* context, TupleRow* row) { diff --git a/be/src/exprs/string_functions.cpp b/be/src/exprs/string_functions.cpp index 90b8298c7c..0c7ea61b9d 100644 --- a/be/src/exprs/string_functions.cpp +++ b/be/src/exprs/string_functions.cpp @@ -89,7 +89,9 @@ StringVal StringFunctions::space(FunctionContext* context, const IntVal& len) { return StringVal(); } int32_t space_size = std::min(len.val, 65535); - StringVal result = StringVal::create_temp_string_val(context, space_size); + // TODO pengyubing + // StringVal result = StringVal::create_temp_string_val(context, space_size); + StringVal result(context, space_size); memset(result.ptr, ' ', space_size); return result; } @@ -102,7 +104,10 @@ StringVal StringFunctions::repeat( if (str.len == 0 || n.val <= 0) { return StringVal(); } - StringVal result = StringVal::create_temp_string_val(context, str.len * n.val); + + // TODO pengyubing + // StringVal result = StringVal::create_temp_string_val(context, str.len * n.val); + StringVal result(context, str.len * n.val); if (UNLIKELY(result.is_null)) { return result; } @@ -127,7 +132,9 @@ StringVal StringFunctions::lpad( return StringVal(str.ptr, len.val); } - StringVal result = StringVal::create_temp_string_val(context, len.val); + // TODO pengyubing + // 
StringVal result = StringVal::create_temp_string_val(context, len.val); + StringVal result(context, len.val); if (result.is_null) { return result; } @@ -160,7 +167,9 @@ StringVal StringFunctions::rpad( return StringVal(str.ptr, len.val); } - StringVal result = StringVal::create_temp_string_val(context, len.val); + // TODO pengyubing + // StringVal result = StringVal::create_temp_string_val(context, len.val); + StringVal result(context, len.val); if (UNLIKELY(result.is_null)) { return result; } @@ -191,7 +200,9 @@ StringVal StringFunctions::lower(FunctionContext* context, const StringVal& str) if (str.is_null) { return StringVal::null(); } - StringVal result = StringVal::create_temp_string_val(context, str.len); + // TODO pengyubing + // StringVal result = StringVal::create_temp_string_val(context, str.len); + StringVal result(context, str.len); if (UNLIKELY(result.is_null)) { return result; } @@ -205,7 +216,9 @@ StringVal StringFunctions::upper(FunctionContext* context, const StringVal& str) if (str.is_null) { return StringVal::null(); } - StringVal result = StringVal::create_temp_string_val(context, str.len); + // TODO pengyubing + // StringVal result = StringVal::create_temp_string_val(context, str.len); + StringVal result(context, str.len); if (UNLIKELY(result.is_null)) { return result; } @@ -219,7 +232,10 @@ StringVal StringFunctions::reverse(FunctionContext* context, const StringVal& st if (str.is_null) { return StringVal::null(); } - StringVal result = StringVal::create_temp_string_val(context, str.len); + + // TODO pengyubing + // StringVal result = StringVal::create_temp_string_val(context, str.len); + StringVal result(context, str.len); if (UNLIKELY(result.is_null)) { return result; } @@ -515,7 +531,10 @@ StringVal StringFunctions::concat_ws( } total_size += sep.len + strs[i].len; } - StringVal result = StringVal::create_temp_string_val(context, total_size); + + // TODO pengyubing + // StringVal result = StringVal::create_temp_string_val(context, 
total_size); + StringVal result(context, total_size); uint8_t* ptr = result.ptr; // Loop again to append the data. diff --git a/be/src/gen_cpp/CMakeLists.txt b/be/src/gen_cpp/CMakeLists.txt index 9f9155019c..b2352c838a 100644 --- a/be/src/gen_cpp/CMakeLists.txt +++ b/be/src/gen_cpp/CMakeLists.txt @@ -71,6 +71,10 @@ set(SRC_FILES ${GEN_CPP_DIR}/olap_common.pb.cc ${GEN_CPP_DIR}/olap_file.pb.cc ${GEN_CPP_DIR}/column_data_file.pb.cc + ${GEN_CPP_DIR}/data.pb.cc + ${GEN_CPP_DIR}/internal_service.pb.cc + ${GEN_CPP_DIR}/types.pb.cc + ${GEN_CPP_DIR}/status.pb.cc #$${GEN_CPP_DIR}/opcode/functions.cc #$${GEN_CPP_DIR}/opcode/vector-functions.cc #$${GEN_CPP_DIR}/opcode/opcode-registry-init.cc diff --git a/be/src/gutil/CMakeLists.txt b/be/src/gutil/CMakeLists.txt index 6ed15f075c..fbc93517ab 100644 --- a/be/src/gutil/CMakeLists.txt +++ b/be/src/gutil/CMakeLists.txt @@ -61,7 +61,7 @@ add_library(Gutil STATIC walltime.cc) set_target_properties(Gutil PROPERTIES COMPILE_FLAGS "-funsigned-char -Wno-deprecated -Wno-char-subscripts") -target_link_libraries(Gutil glog protobuf rt) +# target_link_libraries(Gutil glog protobuf rt) #set(GUTIL_LIBS # glog diff --git a/be/src/gutil/atomicops-internals-x86.cc b/be/src/gutil/atomicops-internals-x86.cc index f02edc6401..23824e36ce 100644 --- a/be/src/gutil/atomicops-internals-x86.cc +++ b/be/src/gutil/atomicops-internals-x86.cc @@ -59,12 +59,12 @@ // Set the flags so that code will run correctly and conservatively // until InitGoogle() is called. -struct AtomicOps_x86CPUFeatureStruct AtomicOps_Internalx86CPUFeatures = { +struct GutilAtomicOps_x86CPUFeatureStruct GutilAtomicOps_Internalx86CPUFeatures = { false, // no SSE2 false // no cmpxchg16b }; -// Initialize the AtomicOps_Internalx86CPUFeatures struct. +// Initialize the GutilAtomicOps_Internalx86CPUFeatures struct. 
static void AtomicOps_Internalx86CPUFeaturesInit() { uint32 eax; uint32 ebx; @@ -90,16 +90,16 @@ static void AtomicOps_Internalx86CPUFeaturesInit() { } // edx bit 26 is SSE2 which we use to tell use whether we can use mfence - AtomicOps_Internalx86CPUFeatures.has_sse2 = ((edx >> 26) & 1); + GutilAtomicOps_Internalx86CPUFeatures.has_sse2 = ((edx >> 26) & 1); // ecx bit 13 indicates whether the cmpxchg16b instruction is supported - AtomicOps_Internalx86CPUFeatures.has_cmpxchg16b = ((ecx >> 13) & 1); + GutilAtomicOps_Internalx86CPUFeatures.has_cmpxchg16b = ((ecx >> 13) & 1); VLOG(1) << "vendor " << vendor << " family " << family << " model " << model << - " sse2 " << AtomicOps_Internalx86CPUFeatures.has_sse2 << - " cmpxchg16b " << AtomicOps_Internalx86CPUFeatures.has_cmpxchg16b; + " sse2 " << GutilAtomicOps_Internalx86CPUFeatures.has_sse2 << + " cmpxchg16b " << GutilAtomicOps_Internalx86CPUFeatures.has_cmpxchg16b; } // AtomicOps initialisation routine for external use. diff --git a/be/src/gutil/atomicops-internals-x86.h b/be/src/gutil/atomicops-internals-x86.h index 641aaafaa9..5c41356bfa 100644 --- a/be/src/gutil/atomicops-internals-x86.h +++ b/be/src/gutil/atomicops-internals-x86.h @@ -42,11 +42,12 @@ // use it. // Features of this x86. Values may not be correct before InitGoogle() is run, // but are set conservatively. -struct AtomicOps_x86CPUFeatureStruct { +// Modify AtomicOps_x86CPUFeatureStruct to GutilAtomicOps_x86CPUFeatureStruct for brpc +struct GutilAtomicOps_x86CPUFeatureStruct { bool has_sse2; // Processor has SSE2. bool has_cmpxchg16b; // Processor supports cmpxchg16b instruction. 
}; -extern struct AtomicOps_x86CPUFeatureStruct AtomicOps_Internalx86CPUFeatures; +extern struct GutilAtomicOps_x86CPUFeatureStruct GutilAtomicOps_Internalx86CPUFeatures; #define ATOMICOPS_COMPILER_BARRIER() __asm__ __volatile__("" : : : "memory") @@ -179,7 +180,7 @@ inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { #else inline void MemoryBarrier() { - if (AtomicOps_Internalx86CPUFeatures.has_sse2) { + if (GutilAtomicOps_Internalx86CPUFeatures.has_sse2) { __asm__ __volatile__("mfence" : : : "memory"); } else { // mfence is faster but not present on PIII Atomic32 x = 0; @@ -188,7 +189,7 @@ inline void MemoryBarrier() { } inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { - if (AtomicOps_Internalx86CPUFeatures.has_sse2) { + if (GutilAtomicOps_Internalx86CPUFeatures.has_sse2) { CheckNaturalAlignment(ptr); *ptr = value; __asm__ __volatile__("mfence" : : : "memory"); diff --git a/be/src/gutil/dynamic_annotations.h b/be/src/gutil/dynamic_annotations.h index ce68d89121..dc2571f3b6 100644 --- a/be/src/gutil/dynamic_annotations.h +++ b/be/src/gutil/dynamic_annotations.h @@ -410,7 +410,7 @@ #if DYNAMIC_ANNOTATIONS_ENABLED == 0 #define ANNOTALYSIS_ONLY 1 #undef ANNOTALYSIS_STATIC_INLINE -#define ANNOTALYSIS_STATIC_INLINE static inline +#define ANNOTALYSIS_STATIC_INLINE inline //static inline #undef ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY #define ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY { (void)file; (void)line; } #endif @@ -452,7 +452,7 @@ #define CLANG_ANNOTALYSIS_ONLY 1 #undef ANNOTALYSIS_STATIC_INLINE -#define ANNOTALYSIS_STATIC_INLINE static inline +#define ANNOTALYSIS_STATIC_INLINE inline //static inline #undef ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY #define ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY { (void)file; (void)line; } diff --git a/be/src/gutil/logging-inl.h b/be/src/gutil/logging-inl.h index 409a99c0ad..6fce563661 100644 --- a/be/src/gutil/logging-inl.h +++ b/be/src/gutil/logging-inl.h @@ -41,10 +41,13 @@ // foo.CheckThatFoo(); // #endif // +// 
Modify this to macro to undefine this #ifdef NDEBUG -const bool DEBUG_MODE = false; +// const bool DEBUG_MODE = false; +#define DEBUG_MODE (false) #else -const bool DEBUG_MODE = true; +// const bool DEBUG_MODE = true; +#define DEBUG_MODE (true) #endif #endif // _LOGGING_IN_H_ diff --git a/be/src/http/CMakeLists.txt b/be/src/http/CMakeLists.txt index 29facde2b1..f97072b5d3 100644 --- a/be/src/http/CMakeLists.txt +++ b/be/src/http/CMakeLists.txt @@ -43,13 +43,14 @@ add_library(Webserver STATIC action/snapshot_action.cpp action/reload_tablet_action.cpp action/pprof_actions.cpp + action/metrics_action.cpp # action/multi_start.cpp # action/multi_show.cpp # action/multi_commit.cpp # action/multi_unload.cpp ) -target_link_libraries(Webserver pthread dl Util) +# target_link_libraries(Webserver pthread dl Util) #ADD_BE_TEST(integer-array-test) #ADD_BE_TEST(runtime-profile-test) #ADD_BE_TEST(benchmark-test) diff --git a/be/src/http/action/metrics_action.cpp b/be/src/http/action/metrics_action.cpp new file mode 100644 index 0000000000..5e295baefa --- /dev/null +++ b/be/src/http/action/metrics_action.cpp @@ -0,0 +1,95 @@ +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "http/action/metrics_action.h" + +#include + +#include "http/http_request.h" +#include "http/http_response.h" +#include "http/http_channel.h" +#include "http/http_headers.h" +#include "http/webserver.h" +#include "runtime/exec_env.h" +#include "util/metrics.h" + +namespace palo { + +class PrometheusMetricsVisitor : public MetricsVisitor { +public: + virtual ~PrometheusMetricsVisitor() {} + void visit(const std::string& prefix, const std::string& name, + MetricCollector* collector) override; + std::string to_string() const { return _ss.str(); } +private: + void _visit_simple_metric( + const std::string& name, const MetricLabels& labels, SimpleMetric* metric); +private: + std::stringstream _ss; +}; + +void PrometheusMetricsVisitor::visit(const std::string& prefix, + const std::string& name, + MetricCollector* collector) { + if (collector->empty() || name.empty()) { + return; + } + std::string metric_name; + if (prefix.empty()) { + metric_name = name; + } else { + metric_name = prefix + "_" + name; + } + // Output metric type + _ss << "# TYPE " << metric_name << " " << collector->type() << "\n"; + switch (collector->type()) { + case MetricType::COUNTER: + case MetricType::GAUGE: + for (auto& it : collector->metrics()) { + _visit_simple_metric(metric_name, it.first, (SimpleMetric*)it.second); + } + break; + default: + break; + } +} + +void PrometheusMetricsVisitor::_visit_simple_metric( + const std::string& name, const MetricLabels& labels, SimpleMetric* metric) { + _ss << name; + // labels + if (!labels.empty()) { + _ss << "{"; + int i = 0; + for (auto& label : labels.labels) { + if (i++ > 0) { + _ss << ","; + } + _ss << label.name << "=\"" << label.value << "\""; + } + _ss << "}"; + } + _ss << " " << metric->to_string() << "\n"; +} + +void MetricsAction::handle(HttpRequest* req, HttpChannel* channel) { + PrometheusMetricsVisitor visitor; + _metrics->collect(&visitor); + std::string str = visitor.to_string(); + HttpResponse response(HttpStatus::OK, 
"text/plain; version=0.0.4", &str); + channel->send_response(response); +} + +} diff --git a/be/src/http/action/metrics_action.h b/be/src/http/action/metrics_action.h new file mode 100644 index 0000000000..8667ec9ae0 --- /dev/null +++ b/be/src/http/action/metrics_action.h @@ -0,0 +1,38 @@ +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "http/http_handler.h" + +namespace palo { + +class Webserver; +class ExecEnv; +class HttpRequest; +class HttpChannel; +class MetricRegistry; + +class MetricsAction : public HttpHandler { +public: + MetricsAction(MetricRegistry* metrics) :_metrics(metrics) { } + virtual ~MetricsAction() { } + + void handle(HttpRequest *req, HttpChannel *channel) override; +private: + MetricRegistry* _metrics; +}; + +} diff --git a/be/src/http/action/pprof_actions.cpp b/be/src/http/action/pprof_actions.cpp index ddab483326..5598606913 100644 --- a/be/src/http/action/pprof_actions.cpp +++ b/be/src/http/action/pprof_actions.cpp @@ -52,7 +52,7 @@ public: }; void HeapAction::handle(HttpRequest* req, HttpChannel* channel) { -#ifdef ADDRESS_SANITIZER +#if defined(ADDRESS_SANITIZER) || defined(LEAK_SANITIZER) || defined(THREAD_SANITIZER) (void)kPprofDefaultSampleSecs; // Avoid unused variable warning. 
std::string str = "Heap profiling is not available with address sanitizer builds."; @@ -93,7 +93,7 @@ public: }; void GrowthAction::handle(HttpRequest* req, HttpChannel* channel) { -#ifdef ADDRESS_SANITIZER +#if defined(ADDRESS_SANITIZER) || defined(LEAK_SANITIZER) || defined(THREAD_SANITIZER) std::string str = "Growth profiling is not available with address sanitizer builds."; HttpResponse response(HttpStatus::OK, &str); channel->send_response(response); @@ -116,7 +116,7 @@ public: }; void ProfileAction::handle(HttpRequest *req, HttpChannel *channel) { -#ifdef ADDRESS_SANITIZER +#if defined(ADDRESS_SANITIZER) || defined(LEAK_SANITIZER) || defined(THREAD_SANITIZER) std::string str = "CPU profiling is not available with address sanitizer builds."; HttpResponse response(HttpStatus::OK, &str); channel->send_response(response); diff --git a/be/src/http/default_path_handlers.cpp b/be/src/http/default_path_handlers.cpp index 9c80608237..149e451f3e 100644 --- a/be/src/http/default_path_handlers.cpp +++ b/be/src/http/default_path_handlers.cpp @@ -26,6 +26,7 @@ #include "runtime/mem_tracker.h" #include "util/debug_util.h" #include "util/logging.h" +#include "util/pretty_printer.h" #include "http/web_page_handler.h" namespace palo { @@ -86,7 +87,7 @@ void mem_usage_handler(MemTracker* mem_tracker, const WebPageHandler::ArgumentMa } (*output) << "
";
-#ifdef ADDRESS_SANITIZER
+#if defined(ADDRESS_SANITIZER) || defined(LEAK_SANITIZER) || defined(THREAD_SANITIZER)
     (*output) << "Memory tracking is not available with address sanitizer builds.";
 #else
     char buf[2048];
diff --git a/be/src/http/download_action.h b/be/src/http/download_action.h
index dca1253688..bb8d35dc5a 100644
--- a/be/src/http/download_action.h
+++ b/be/src/http/download_action.h
@@ -19,6 +19,7 @@
 #include "exec/csv_scanner.h"
 #include "exec/scan_node.h"
 #include "runtime/descriptors.h"
+#include "http/http_handler.h"
 
 namespace palo {
 
diff --git a/be/src/http/http_channel.cpp b/be/src/http/http_channel.cpp
index 0c1fd50c74..7949dbaf64 100644
--- a/be/src/http/http_channel.cpp
+++ b/be/src/http/http_channel.cpp
@@ -63,6 +63,7 @@ void HttpChannel::send_response(const HttpResponse& response) {
     mg_printf(_mg_conn, "\r\n");
     if (contain_content) {
         mg_write(_mg_conn, content->c_str(), content->length());
+        _send_bytes += content->length();
     }
 }
 
@@ -105,6 +106,7 @@ void HttpChannel::send_response_content(const HttpResponse& response) {
     bool contain_content =  content != nullptr && !content->empty();
     if (contain_content) {
         mg_write(_mg_conn, content->c_str(), content->length());
+        _send_bytes += content->length();
     }
 }
 
@@ -113,6 +115,7 @@ void HttpChannel::append_response_content(
         const char* content,
         int32_t content_size) {
     mg_write(_mg_conn, content, content_size);
+    _send_bytes += content_size;
 }
 
 int HttpChannel::read(char* buf, int len) {
diff --git a/be/src/http/http_channel.h b/be/src/http/http_channel.h
index d80e4b3c72..81d083ef8c 100644
--- a/be/src/http/http_channel.h
+++ b/be/src/http/http_channel.h
@@ -52,10 +52,12 @@ public:
     // Helper maybe used everywhere
     void send_basic_challenge(const std::string& realm);
 
+    int64_t send_bytes() const { return _send_bytes; }
 private:
     const HttpRequest& _request;
     // save mongoose connection here
     mg_connection* _mg_conn;
+    int64_t _send_bytes = 0;
 };
 
 }
diff --git a/be/src/http/mongoose.cpp b/be/src/http/mongoose.cpp
index 3cc76fe4e8..8f1568a968 100644
--- a/be/src/http/mongoose.cpp
+++ b/be/src/http/mongoose.cpp
@@ -25,7 +25,9 @@
 #define _XOPEN_SOURCE 600     // For flockfile() on Linux
 #endif
 #define _LARGEFILE_SOURCE     // Enable 64-bit file offsets
+#ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS  //  wants this for C++
+#endif
 #define __STDC_LIMIT_MACROS   // C++ wants that for INT64_MAX
 #endif
 
@@ -1386,14 +1388,6 @@ static pid_t spawn_process(struct mg_connection *conn, const char *prog,
 }
 #endif // !NO_CGI
 
-static int set_non_blocking_mode(SOCKET sock) {
-  int flags;
-
-  flags = fcntl(sock, F_GETFL, 0);
-  (void) fcntl(sock, F_SETFL, flags | O_NONBLOCK);
-
-  return 0;
-}
 #endif // _WIN32
 
 // Write data to the IO channel - opened file descriptor, socket or SSL
@@ -3185,6 +3179,7 @@ static void handle_cgi_request(struct mg_connection *conn, const char *prog) {
   const char *status, *status_text;
   char buf[16384], *pbuf, dir[PATH_MAX], *p;
   struct mg_request_info ri;
+  ri.num_headers = 0;
   struct cgi_env_block blk;
   FILE *in, *out;
   pid_t pid;
@@ -4329,9 +4324,8 @@ static void reset_per_request_attributes(struct mg_connection *conn) {
 }
 
 static void close_socket_gracefully(struct mg_connection *conn) {
-  char buf[MG_BUF_LEN];
   struct linger linger;
-  int n, sock = conn->client.sock;
+  int sock = conn->client.sock;
 
   // Set linger option to avoid socket hanging out after close. This prevent
   // ephemeral port exhaust problem under high QPS.
@@ -4344,6 +4338,8 @@ static void close_socket_gracefully(struct mg_connection *conn) {
 // mongoose bug in Linux
 // only used in windows 
 #if defined(_WIN32)
+  char buf[MG_BUF_LEN];
+  int n = 0;
   set_non_blocking_mode(sock);
 
   // Read and discard pending incoming data. If we do not do that and close the
diff --git a/be/src/http/mongoose.h b/be/src/http/mongoose.h
index 42abdcb00b..6ab862aef1 100644
--- a/be/src/http/mongoose.h
+++ b/be/src/http/mongoose.h
@@ -29,19 +29,19 @@ struct mg_connection;  // Handle for the individual connection
 
 // This structure contains information about the HTTP request.
 struct mg_request_info {
-  char *request_method;  // "GET", "POST", etc
-  char *uri;             // URL-decoded URI
-  char *http_version;    // E.g. "1.0", "1.1"
-  char *query_string;    // URL part after '?' (not including '?') or NULL
-  char *remote_user;     // Authenticated user, or NULL if no auth used
-  long remote_ip;        // Client's IP address
-  int remote_port;       // Client's port
-  int is_ssl;            // 1 if SSL-ed, 0 if not
-  int num_headers;       // Number of headers
+  char *request_method = nullptr;  // "GET", "POST", etc
+  char *uri = nullptr;             // URL-decoded URI
+  char *http_version = nullptr;    // E.g. "1.0", "1.1"
+  char *query_string = nullptr;    // URL part after '?' (not including '?') or NULL
+  char *remote_user = nullptr;     // Authenticated user, or NULL if no auth used
+  long remote_ip;                  // Client's IP address
+  int remote_port;                 // Client's port
+  int is_ssl;                      // 1 if SSL-ed, 0 if not
+  int num_headers;                 // Number of headers
   struct mg_header {
-    char *name;          // HTTP header name
-    char *value;         // HTTP header value
-  } http_headers[64];    // Maximum 64 headers
+    char *name = nullptr;          // HTTP header name
+    char *value = nullptr;         // HTTP header value
+  } http_headers[64];              // Maximum 64 headers
 };
 
 
diff --git a/be/src/http/webserver.cpp b/be/src/http/webserver.cpp
index b5618c711a..41eba59b50 100644
--- a/be/src/http/webserver.cpp
+++ b/be/src/http/webserver.cpp
@@ -32,6 +32,8 @@
 #include "util/url_coding.h"
 #include "util/logging.h"
 #include "util/debug_util.h"
+#include "util/palo_metrics.h"
+#include "util/runtime_profile.h"
 #include "http/http_response.h"
 
 namespace palo {
@@ -212,6 +214,7 @@ void* Webserver::mongoose_callback_static(
     return nullptr;
 }
 
+class PaloMetrics;
 void* Webserver::mongoose_callback(struct mg_connection* mg_conn) {
     HttpRequest request(mg_conn);
     HttpChannel channel(request, mg_conn);
@@ -229,8 +232,14 @@ void* Webserver::mongoose_callback(struct mg_connection* mg_conn) {
         return PROCESSING_COMPLETE;
     }
 
-    // process
-    handler->handle(&request, &channel);
+    int64_t duration_ns = 0;
+    {
+        SCOPED_RAW_TIMER(&duration_ns);
+        handler->handle(&request, &channel);
+    }
+    PaloMetrics::http_requests_total.increment(1);
+    PaloMetrics::http_request_duration_us.increment(duration_ns / 1000);
+    PaloMetrics::http_request_send_bytes.increment(channel.send_bytes());
 
     // return code to mongoose
     return PROCESSING_COMPLETE;
diff --git a/be/src/olap/CMakeLists.txt b/be/src/olap/CMakeLists.txt
index a7329fb57a..2a8b9117cf 100644
--- a/be/src/olap/CMakeLists.txt
+++ b/be/src/olap/CMakeLists.txt
@@ -20,12 +20,19 @@ set(LIBRARY_OUTPUT_PATH "${BUILD_DIR}/src/olap")
 set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/src/olap")
   
 add_library(Olap STATIC
+    comparison_predicate.cpp
+    in_list_predicate.cpp
+    null_predicate.cpp
     olap_reader.cpp
-    base_expansion_handler.cpp
+    base_compaction.cpp
     command_executor.cpp
-    cumulative_handler.cpp
+    cumulative_compaction.cpp
     delete_handler.cpp
+    aggregate_func.cpp
+    types.cpp 
     field.cpp
+    field_info.cpp
+    hll.cpp
     file_helper.cpp
     i_data.cpp
     lru_cache.cpp
@@ -46,6 +53,7 @@ add_library(Olap STATIC
     row_cursor.cpp
     schema_change.cpp
     utils.cpp
+    wrapper_field.cpp
     writer.cpp
     column_file/bit_field_reader.cpp
     column_file/bit_field_writer.cpp
diff --git a/be/src/olap/aggregate_func.cpp b/be/src/olap/aggregate_func.cpp
new file mode 100644
index 0000000000..44d2b73048
--- /dev/null
+++ b/be/src/olap/aggregate_func.cpp
@@ -0,0 +1,151 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/aggregate_func.h"
+
+namespace palo {
+
+struct AggregateFuncMapHash {
+    size_t operator()(const std::pair& pair) const {
+        return (pair.first + 31) ^ pair.second;
+    }
+};
+
+class AggregateFuncResolver {
+DECLARE_SINGLETON(AggregateFuncResolver);
+public:
+    AggregateFunc get_aggregate_func(const FieldAggregationMethod agg_method,
+                                     const FieldType field_type) {
+        auto pair = _aggregate_mapping.find(std::make_pair(agg_method, field_type));
+        if (pair != _aggregate_mapping.end()) {
+            return pair->second;
+        } else {
+            return nullptr;
+        }
+    }
+
+    FinalizeFunc get_finalize_func(const FieldAggregationMethod agg_method,
+                                     const FieldType field_type) {
+        auto pair = _finalize_mapping.find(std::make_pair(agg_method, field_type));
+        if (pair != _finalize_mapping.end()) {
+            return pair->second;
+        } else {
+            return nullptr;
+        }
+    }
+
+    template
+    void add_aggregate_mapping() {
+        _aggregate_mapping.insert(std::make_pair(std::make_pair(agg_method, field_type),
+                         &AggregateFuncTraits::aggregate));
+    }
+
+    template
+    void add_finalize_mapping() {
+        _finalize_mapping.insert(std::make_pair(std::make_pair(agg_method, field_type),
+                         &AggregateFuncTraits::finalize));
+    }
+private:
+    typedef std::pair key_t;
+    std::unordered_map _aggregate_mapping;
+    std::unordered_map _finalize_mapping;
+
+    DISALLOW_COPY_AND_ASSIGN(AggregateFuncResolver);
+};
+
+AggregateFuncResolver::AggregateFuncResolver() {
+    // None Aggregate Function, no-ops
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+
+    // Min Aggregate Function
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+
+    // Max Aggregate Function
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+
+    // Sum Aggregate Function
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+
+    // Replace Aggregate Function
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+    add_aggregate_mapping();
+
+    // Hyperloglog Aggregate Function
+    add_aggregate_mapping();
+
+
+    // Finalize Function for hyperloglog Function
+    add_finalize_mapping();
+}
+
+AggregateFuncResolver::~AggregateFuncResolver() {}
+
+AggregateFunc get_aggregate_func(const FieldAggregationMethod agg_method,
+                                 const FieldType field_type) {
+    return AggregateFuncResolver::get_instance()->get_aggregate_func(agg_method, field_type);
+}
+
+FinalizeFunc get_finalize_func(const FieldAggregationMethod agg_method,
+                                 const FieldType field_type) {
+    return AggregateFuncResolver::get_instance()->get_finalize_func(agg_method, field_type);
+}
+
+} // namespace palo
diff --git a/be/src/olap/aggregate_func.h b/be/src/olap/aggregate_func.h
new file mode 100644
index 0000000000..2743eeea43
--- /dev/null
+++ b/be/src/olap/aggregate_func.h
@@ -0,0 +1,269 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_SRC_OLAP_AGGREGATE_FUNC_H
+#define BDG_PALO_BE_SRC_OLAP_AGGREGATE_FUNC_H
+
+#include "olap/field_info.h"
+#include "olap/hll.h"
+#include "olap/types.h"
+
+namespace palo {
+
+using AggregateFunc = void (*)(char* left, char* right);
+using FinalizeFunc = void (*)(char* data);
+
+template struct AggregateFuncTraits {};
+
+template
+struct AggregateFuncTraits {
+    static void aggregate(char* left, char* right) {}
+};
+
+template 
+struct AggregateFuncTraits {
+    static void aggregate(char* left, char* right) {
+        typedef typename FieldTypeTraits::CppType CppType;
+        bool l_null = *reinterpret_cast(left);
+        bool r_null = *reinterpret_cast(right);
+        if (l_null) {
+            return;
+        } else if (r_null) {
+            *reinterpret_cast(left) = true;
+        } else {
+            CppType* l_val = reinterpret_cast(left + 1);
+            CppType* r_val = reinterpret_cast(right + 1);
+            if (*r_val < *l_val) { *l_val = *r_val; }
+        }
+    }
+};
+
+template <>
+struct AggregateFuncTraits {
+    static void aggregate(char* left, char* right) {
+        typedef typename FieldTypeTraits::CppType CppType;
+        bool l_null = *reinterpret_cast(left);
+        bool r_null = *reinterpret_cast(right);
+        if (l_null) {
+            return;
+        } else if (r_null) {
+            *reinterpret_cast(left) = true;
+        } else {
+            CppType l_val, r_val;
+            memcpy(&l_val, left + 1, sizeof(CppType));
+            memcpy(&r_val, right + 1, sizeof(CppType));
+            if (r_val < l_val) {
+                memcpy(left + 1, right + 1, sizeof(CppType));
+            }
+        }
+    }
+};
+
+template 
+struct AggregateFuncTraits {
+    static void aggregate(char* left, char* right) {
+        typedef typename FieldTypeTraits::CppType CppType;
+        bool l_null = *reinterpret_cast(left);
+        bool r_null = *reinterpret_cast(right);
+        if (r_null) {
+            return;
+        }
+
+        CppType* l_val = reinterpret_cast(left + 1);
+        CppType* r_val = reinterpret_cast(right + 1);
+        if (l_null) {
+            *reinterpret_cast(left) = false;
+            *l_val = *r_val;
+        } else {
+            if (*r_val > *l_val) { *l_val = *r_val; }
+        }
+    }
+};
+
+template <>
+struct AggregateFuncTraits {
+    static void aggregate(char* left, char* right) {
+        typedef typename FieldTypeTraits::CppType CppType;
+        bool l_null = *reinterpret_cast(left);
+        bool r_null = *reinterpret_cast(right);
+        if (r_null) {
+            return;
+        }
+
+        if (l_null) {
+            *reinterpret_cast(left) = false;
+            memcpy(left + 1, right + 1, sizeof(CppType));
+        } else {
+            CppType l_val, r_val;
+            memcpy(&l_val, left + 1, sizeof(CppType));
+            memcpy(&r_val, right + 1, sizeof(CppType));
+            if (r_val > l_val) {
+                memcpy(left + 1, right + 1, sizeof(CppType));
+            }
+        }
+    }
+};
+
+template 
+struct AggregateFuncTraits {
+    static void aggregate(char* left, char* right) {
+        typedef typename FieldTypeTraits::CppType CppType;
+        bool l_null = *reinterpret_cast(left);
+        bool r_null = *reinterpret_cast(right);
+        if (r_null) {
+            return;
+        }
+
+        CppType* l_val = reinterpret_cast(left + 1);
+        CppType* r_val = reinterpret_cast(right + 1);
+        if (l_null) {
+            *reinterpret_cast(left) = false;
+            *l_val = *r_val;
+        } else {
+            *l_val += *r_val;
+        }
+    }
+};
+
+template <>
+struct AggregateFuncTraits {
+    static void aggregate(char* left, char* right) {
+        typedef typename FieldTypeTraits::CppType CppType;
+        bool l_null = *reinterpret_cast(left);
+        bool r_null = *reinterpret_cast(right);
+        if (r_null) {
+            return;
+        }
+
+        if (l_null) {
+            *reinterpret_cast(left) = false;
+            memcpy(left + 1, right + 1, sizeof(CppType));
+        } else {
+            CppType l_val, r_val;
+            memcpy(&l_val, left + 1, sizeof(CppType));
+            memcpy(&r_val, right + 1, sizeof(CppType));
+            l_val += r_val;
+            memcpy(left + 1, &l_val, sizeof(CppType));
+        }
+    }
+};
+
+template 
+struct AggregateFuncTraits {
+    static void aggregate(char* left, char* right) {
+        typedef typename FieldTypeTraits::CppType CppType;
+        bool r_null = *reinterpret_cast(right);
+        *reinterpret_cast(left) = r_null;
+
+        if (!r_null) {
+            CppType* l_val = reinterpret_cast(left + 1);
+            CppType* r_val = reinterpret_cast(right + 1);
+            *l_val = *r_val;
+        }
+    }
+};
+
+template <>
+struct AggregateFuncTraits {
+    static void aggregate(char* left, char* right) {
+        typedef typename FieldTypeTraits::CppType CppType;
+        bool r_null = *reinterpret_cast(right);
+        *reinterpret_cast(left) = r_null;
+
+        if (!r_null) {
+            memcpy(left + 1, right + 1, sizeof(CppType));
+        }
+    }
+};
+
+template <>
+struct AggregateFuncTraits {
+    static void aggregate(char* left, char* right) {
+        bool r_null = *reinterpret_cast(right);
+        *reinterpret_cast(left) = r_null;
+        if (!r_null) {
+            StringSlice* l_slice = reinterpret_cast(left + 1);
+            StringSlice* r_slice = reinterpret_cast(right + 1);
+            memory_copy(l_slice->data, r_slice->data, r_slice->size);
+            l_slice->size = r_slice->size;
+        }
+    }
+};
+
+template <>
+struct AggregateFuncTraits {
+    static void aggregate(char* left, char* right) {
+        //same with char aggregate
+        AggregateFuncTraits::aggregate(left, right);
+    }
+};
+
+template <>
+struct AggregateFuncTraits {
+    static void aggregate(char* left, char* right) {
+        StringSlice* l_slice = reinterpret_cast(left + 1);
+        size_t hll_ptr = *(size_t*)(l_slice->data - sizeof(HllContext*));
+        HllContext* context = (reinterpret_cast(hll_ptr));
+        HllSetHelper::fill_set(right + 1, context);
+    }
+    static void finalize(char* data) {
+        StringSlice* slice = reinterpret_cast(data);
+        size_t hll_ptr = *(size_t*)(slice->data - sizeof(HllContext*));
+        HllContext* context = (reinterpret_cast(hll_ptr));
+        std::map index_to_value;
+        if (context->has_sparse_or_full ||
+                context->hash64_set.size() > HLL_EXPLICLIT_INT64_NUM) {
+            HllSetHelper::set_max_register(context->registers, HLL_REGISTERS_COUNT,
+                                           context->hash64_set);
+            for (int i = 0; i < HLL_REGISTERS_COUNT; i++) {
+                if (context->registers[i] != 0) {
+                    index_to_value[i] = context->registers[i];
+                }
+            }
+        }
+        int sparse_set_len = index_to_value.size() *
+            (sizeof(HllSetResolver::SparseIndexType)
+             + sizeof(HllSetResolver::SparseValueType))
+            + sizeof(HllSetResolver::SparseLengthValueType);
+        int result_len = 0;
+
+        if (sparse_set_len >= HLL_COLUMN_DEFAULT_LEN) {
+            // full set
+            HllSetHelper::set_full(slice->data, context->registers,
+                                   HLL_REGISTERS_COUNT, result_len);
+        } else if (index_to_value.size() > 0) {
+            // sparse set
+            HllSetHelper::set_sparse(slice->data, index_to_value, result_len);
+        } else if (context->hash64_set.size() > 0) {
+            // expliclit set
+            HllSetHelper::set_expliclit(slice->data, context->hash64_set, result_len);
+        }
+
+        slice->size = result_len & 0xffff;
+
+        HllSetHelper::init_context(context);
+    }
+};
+
+extern AggregateFunc get_aggregate_func(const FieldAggregationMethod agg_method,
+                                        const FieldType field_type);
+extern FinalizeFunc get_finalize_func(const FieldAggregationMethod agg_method,
+                                      const FieldType field_type);
+
+} // namespace palo
+
+#endif // BDG_PALO_BE_SRC_OLAP_AGGREGATE_FUNC_H
diff --git a/be/src/olap/base_expansion_handler.cpp b/be/src/olap/base_compaction.cpp
similarity index 82%
rename from be/src/olap/base_expansion_handler.cpp
rename to be/src/olap/base_compaction.cpp
index 321a1cd6e5..09eb805898 100644
--- a/be/src/olap/base_expansion_handler.cpp
+++ b/be/src/olap/base_compaction.cpp
@@ -13,7 +13,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "olap/base_expansion_handler.h"
+#include "olap/base_compaction.h"
 
 #include 
 #include 
@@ -38,42 +38,42 @@ using std::vector;
 
 namespace palo {
 
-OLAPStatus BaseExpansionHandler::init(SmartOLAPTable table, bool is_manual_trigger) {
+OLAPStatus BaseCompaction::init(SmartOLAPTable table, bool is_manual_trigger) {
     // 表在首次查询或PUSHç­‰æ“作时,会被加载到内存
     // å¦‚æžœè¡¨æ²¡æœ‰è¢«åŠ è½½ï¼Œè¡¨æ˜Žè¯¥è¡¨ä¸Šç›®å‰æ²¡æœ‰ä»»ä½•æ“作,所以ä¸è¿›è¡ŒBEæ“作
     if (!table->is_loaded()) {
         return OLAP_ERR_INPUT_PARAMETER_ERROR;
     }
 
-    OLAP_LOG_TRACE("init base expansion handler. [table=%s]", table->full_name().c_str());
+    OLAP_LOG_TRACE("init base compaction handler. [table=%s]", table->full_name().c_str());
 
     _table = table;
 
-    // 1. å°è¯•å–å¾—base expansionçš„é”
-    if (!_try_base_expansion_lock()) {
-        OLAP_LOG_WARNING("another base expansion is running. [table=%s]",
+    // 1. å°è¯•å–å¾—base compactionçš„é”
+    if (!_try_base_compaction_lock()) {
+        OLAP_LOG_WARNING("another base compaction is running. [table=%s]",
                          table->full_name().c_str());
         return OLAP_ERR_BE_TRY_BE_LOCK_ERROR;
     }
 
-    // 2. æ£€æŸ¥æ˜¯å¦æ»¡è¶³base expansion触å‘ç­–ç•¥
-    OLAP_LOG_TRACE("check whether satisfy base expansion policy.");
+    // 2. æ£€æŸ¥æ˜¯å¦æ»¡è¶³base compaction触å‘ç­–ç•¥
+    OLAP_LOG_TRACE("check whether satisfy base compaction policy.");
     bool is_policy_satisfied = false;
     vector candidate_versions;
     is_policy_satisfied = _check_whether_satisfy_policy(is_manual_trigger, &candidate_versions);
 
-    // 2.1 å¦‚æžœä¸æ»¡è¶³è§¦å‘策略,则直接释放base expansioné”, 返回错误ç 
+    // 2.1 å¦‚æžœä¸æ»¡è¶³è§¦å‘策略,则直接释放base compactioné”, 返回错误ç 
     if (!is_policy_satisfied) {
-        _release_base_expansion_lock();
+        _release_base_compaction_lock();
 
         return OLAP_ERR_BE_NO_SUITABLE_VERSION;
     }
 
-    // 2.2 如果满足触å‘策略,触å‘base expansion
-    //     ä¸é‡Šæ”¾base expansioné”, 在run()完æˆä¹‹åŽå†é‡Šæ”¾
+    // 2.2 如果满足触å‘策略,触å‘base compaction
+    //     ä¸é‡Šæ”¾base compactioné”, 在run()完æˆä¹‹åŽå†é‡Šæ”¾
     if (!_validate_need_merged_versions(candidate_versions)) {
         OLAP_LOG_FATAL("error! invalid need merged versions");
-        _release_base_expansion_lock();
+        _release_base_compaction_lock();
         return OLAP_ERR_BE_INVALID_NEED_MERGED_VERSIONS;
     }
 
@@ -82,8 +82,8 @@ OLAPStatus BaseExpansionHandler::init(SmartOLAPTable table, bool is_manual_trigg
     return OLAP_SUCCESS;
 }
 
-OLAPStatus BaseExpansionHandler::run() {
-    OLAP_LOG_INFO("start base expansion. [table=%s; old_base_version=%d; new_base_version=%d]",
+OLAPStatus BaseCompaction::run() {
+    OLAP_LOG_INFO("start base compaction. [table=%s; old_base_version=%d; new_base_version=%d]",
                   _table->full_name().c_str(),
                   _old_base_version.second,
                   _new_base_version.second);
@@ -91,7 +91,7 @@ OLAPStatus BaseExpansionHandler::run() {
     OLAPStatus res = OLAP_SUCCESS;
     OlapStopWatch stage_watch;
 
-    _table->set_base_expansion_status(BASE_EXPANSION_RUNNING, _new_base_version.second);
+    _table->set_base_compaction_status(BASE_COMPACTION_RUNNING, _new_base_version.second);
 
     // 1. 计算新base的version hash
     VersionHash new_base_version_hash;
@@ -117,13 +117,13 @@ OLAPStatus BaseExpansionHandler::run() {
         return OLAP_ERR_BE_ACQUIRE_DATA_SOURCES_ERROR;
     }
 
-    if (PaloMetrics::be_merge_delta_num() != NULL) {
-        PaloMetrics::be_merge_delta_num()->increment(_need_merged_versions.size());
-        int64_t merge_size = 0;
+    {
+        PaloMetrics::base_compaction_deltas_total.increment(_need_merged_versions.size());
+        int64_t merge_bytes = 0;
         for (IData* i_data : base_data_sources) {
-            merge_size += i_data->olap_index()->data_size();
+            merge_bytes += i_data->olap_index()->data_size();
         }
-        PaloMetrics::be_merge_size()->increment(merge_size);
+        PaloMetrics::base_compaction_bytes_total.increment(merge_bytes);
     }
 
     // ä¿å­˜ç”Ÿæˆbase文件时候计算的selectivities
@@ -131,10 +131,10 @@ OLAPStatus BaseExpansionHandler::run() {
     // ä¿å­˜ç”Ÿæˆbase文件时候累积的行数
     uint64_t row_count = 0;
 
-    // 3. 执行base expansion
+    // 3. 执行base compaction
     //    执行过程å¯èƒ½ä¼šæŒç»­æ¯”较长时间
     stage_watch.reset();
-    res = _do_base_expansion(new_base_version_hash,
+    res = _do_base_compaction(new_base_version_hash,
                              &base_data_sources,
                              &selectivities,
                              &row_count);
@@ -175,7 +175,7 @@ OLAPStatus BaseExpansionHandler::run() {
             }
 
             ++sleep_count;
-            OLAP_LOG_FATAL("base expansion's delete action has error.sleep 1 minute...");
+            OLAP_LOG_FATAL("base compaction's delete action has error.sleep 1 minute...");
             sleep(60);
         }
 
@@ -183,13 +183,13 @@ OLAPStatus BaseExpansionHandler::run() {
         return OLAP_ERR_BE_ERROR_DELETE_ACTION;
     }
 
-    _table->set_base_expansion_status(BASE_EXPANSION_WAITING, -1);
-    _release_base_expansion_lock();
+    _table->set_base_compaction_status(BASE_COMPACTION_WAITING, -1);
+    _release_base_compaction_lock();
 
     return OLAP_SUCCESS;
 }
 
-OLAPStatus BaseExpansionHandler::_exclude_not_expired_delete(
+OLAPStatus BaseCompaction::_exclude_not_expired_delete(
         const vector& need_merged_versions,
         vector* candidate_versions) {
     const int64_t delete_delta_expire_time = config::delete_delta_expire_time * 60;
@@ -225,7 +225,7 @@ static bool version_comparator(const Version& lhs, const Version& rhs) {
     return lhs.second < rhs.second;
 }
 
-bool BaseExpansionHandler::_check_whether_satisfy_policy(bool is_manual_trigger,
+bool BaseCompaction::_check_whether_satisfy_policy(bool is_manual_trigger,
                                                          vector* candidate_versions) {
     _obtain_header_rdlock();
     int32_t cumulative_layer_point = _table->cumulative_layer_point();
@@ -249,10 +249,10 @@ bool BaseExpansionHandler::_check_whether_satisfy_policy(bool is_manual_trigger,
         return  false;
     }
 
-    // be_layer_point应该为cumulative_layer_point之å‰ï¼Œå€’数第2个cumulative文件的end version
+    // base_compaction_layer_point应该为cumulative_layer_point之å‰ï¼Œå€’数第2个cumulative文件的end version
     int64_t base_creation_time = 0;
     size_t base_size = 0;
-    int32_t be_layer_point = -1;
+    int32_t base_compaction_layer_point = -1;
     for (unsigned int index = 0; index < path_versions.size(); ++index) {
         Version temp = path_versions[index];
         // base文件
@@ -264,15 +264,15 @@ bool BaseExpansionHandler::_check_whether_satisfy_policy(bool is_manual_trigger,
         }
 
         if (temp.second == cumulative_layer_point) {
-            be_layer_point = temp.first - 1;
+            base_compaction_layer_point = temp.first - 1;
             _latest_cumulative = temp;
-            _new_base_version = Version(0, be_layer_point);
+            _new_base_version = Version(0, base_compaction_layer_point);
         }
     }
 
     // åªæœ‰1个base文件和1个delta文件
-    if (be_layer_point == -1) {
-        OLAP_LOG_TRACE("can't do base expansion: no cumulative files. "
+    if (base_compaction_layer_point == -1) {
+        OLAP_LOG_TRACE("can't do base compaction: no cumulative files. "
                        "[table=%s; base_version=0-%d; cumulative_layer_point=%d]",
                        _table->full_name().c_str(),
                        _old_base_version.second,
@@ -283,8 +283,8 @@ bool BaseExpansionHandler::_check_whether_satisfy_policy(bool is_manual_trigger,
     }
 
     // åªæœ‰1个cumulative文件
-    if (be_layer_point == _old_base_version.second) {
-        OLAP_LOG_TRACE("can't do base expansion: only one cumulative file. "
+    if (base_compaction_layer_point == _old_base_version.second) {
+        OLAP_LOG_TRACE("can't do base compaction: only one cumulative file. "
                        "[table=%s; base_version=0-%d; cumulative_layer_point=%d]",
                        _table->full_name().c_str(),
                        _old_base_version.second,
@@ -306,10 +306,10 @@ bool BaseExpansionHandler::_check_whether_satisfy_policy(bool is_manual_trigger,
 
     std::sort(need_merged_versions.begin(), need_merged_versions.end(), version_comparator);
 
-    // 如果是手动执行START_BASE_EXPANSIONå‘½ä»¤ï¼Œåˆ™ä¸æ£€æŸ¥base expansion policy, 
-    // 也ä¸è€ƒè™‘删除版本过期问题,  åªè¦æœ‰å¯ä»¥åˆå¹¶çš„cumulative,就执行base expansion
+    // 如果是手动执行START_BASE_COMPACTIONå‘½ä»¤ï¼Œåˆ™ä¸æ£€æŸ¥base compaction policy, 
+    // 也ä¸è€ƒè™‘删除版本过期问题,  åªè¦æœ‰å¯ä»¥åˆå¹¶çš„cumulative,就执行base compaction
     if (is_manual_trigger) {
-        OLAP_LOG_TRACE("manual triggle base expansion. [table=%s]", _table->full_name().c_str());
+        OLAP_LOG_TRACE("manual trigger base compaction. [table=%s]", _table->full_name().c_str());
 
         *candidate_versions = need_merged_versions;
         _release_header_lock();
@@ -344,47 +344,48 @@ bool BaseExpansionHandler::_check_whether_satisfy_policy(bool is_manual_trigger,
 
     _release_header_lock();
 
-    // æ£€æŸ¥æ˜¯å¦æ»¡è¶³base expansionçš„è§¦å‘æ¡ä»¶
-    // 满足以下æ¡ä»¶æ—¶è§¦å‘base expansion: è§¦å‘æ¡ä»¶1 || è§¦å‘æ¡ä»¶2 || è§¦å‘æ¡ä»¶3
+    // æ£€æŸ¥æ˜¯å¦æ»¡è¶³base compactionçš„è§¦å‘æ¡ä»¶
+    // 满足以下æ¡ä»¶æ—¶è§¦å‘base compaction: è§¦å‘æ¡ä»¶1 || è§¦å‘æ¡ä»¶2 || è§¦å‘æ¡ä»¶3
     // è§¦å‘æ¡ä»¶1:cumulative文件个数超过一个阈值
-    const uint32_t be_policy_cumulative_files_number = config::be_policy_cumulative_files_number;
+    const uint32_t base_compaction_num_cumulative_deltas
+        = config::base_compaction_num_cumulative_deltas;
     // candidate_versions中包å«base文件,所以这里å‡1
-    if (candidate_versions->size() - 1 >= be_policy_cumulative_files_number) {
-        OLAP_LOG_INFO("satisfy the base expansion policy. [table=%s; "
-                      "cumualtive_files_number=%d; policy_cumulative_files_number=%d]",
+    if (candidate_versions->size() - 1 >= base_compaction_num_cumulative_deltas) {
+        OLAP_LOG_INFO("satisfy the base compaction policy. [table=%s; "
+                      "num_cumualtive_deltas=%d; base_compaction_num_cumulative_deltas=%d]",
                       _table->full_name().c_str(),
                       candidate_versions->size() - 1,
-                      be_policy_cumulative_files_number);
+                      base_compaction_num_cumulative_deltas);
         return true;
     }
 
     // è§¦å‘æ¡ä»¶2:所有cumulative文件的大å°è¶…过base文件大å°çš„æŸä¸€æ¯”ä¾‹
-    const double be_policy_cumulative_base_ratio = config::be_policy_cumulative_base_ratio;
+    const double base_cumulative_delta_ratio = config::base_cumulative_delta_ratio;
     double cumulative_base_ratio = static_cast(cumulative_total_size) / base_size;
-    if (cumulative_base_ratio > be_policy_cumulative_base_ratio) {
-        OLAP_LOG_INFO("satisfy the base expansion policy. [table=%s; cumualtive_total_size=%d; "
+    if (cumulative_base_ratio > base_cumulative_delta_ratio) {
+        OLAP_LOG_INFO("satisfy the base compaction policy. [table=%s; cumualtive_total_size=%d; "
                       "base_size=%d; cumulative_base_ratio=%f; policy_ratio=%f]",
                       _table->full_name().c_str(),
                       cumulative_total_size,
                       base_size,
                       cumulative_base_ratio,
-                      be_policy_cumulative_base_ratio);
+                      base_cumulative_delta_ratio);
         return true;
     }
 
-    // è§¦å‘æ¡ä»¶3:è·ç¦»ä¸Šä¸€æ¬¡è¿›è¡Œbase expansionå·²ç»è¶…过设定的间隔时间
-    const uint32_t be_policy_be_interval = config::be_policy_be_interval_seconds;
+    // è§¦å‘æ¡ä»¶3:è·ç¦»ä¸Šä¸€æ¬¡è¿›è¡Œbase compactionå·²ç»è¶…过设定的间隔时间
+    const uint32_t interval_since_laste_operation = config::base_compaction_interval_seconds_since_last_operation;
     int64_t interval_since_last_be = time(NULL) - base_creation_time;
-    if (interval_since_last_be > be_policy_be_interval) {
-        OLAP_LOG_INFO("satisfy the base expansion policy. [table=%s; "
+    if (interval_since_last_be > interval_since_laste_operation) {
+        OLAP_LOG_INFO("satisfy the base compaction policy. [table=%s; "
                       "interval_since_last_be=%ld; policy_interval=%ld]",
                       _table->full_name().c_str(),
-                      interval_since_last_be, be_policy_be_interval);
+                      interval_since_last_be, interval_since_laste_operation);
         return true;
     }
 
     OLAP_LOG_TRACE(
-            "don't satisfy the base expansion policy."
+            "don't satisfy the base compaction policy."
             "[cumulative_files_number=%d; cumulative_base_ratio=%f; interval_since_last_be=%ld]",
             candidate_versions->size() - 1,
             cumulative_base_ratio,
@@ -393,7 +394,7 @@ bool BaseExpansionHandler::_check_whether_satisfy_policy(bool is_manual_trigger,
     return false;
 }
 
-OLAPStatus BaseExpansionHandler::_do_base_expansion(VersionHash new_base_version_hash,
+OLAPStatus BaseCompaction::_do_base_compaction(VersionHash new_base_version_hash,
                                                     vector* base_data_sources,
                                                     vector* selectivities,
                                                     uint64_t* row_count) {
@@ -412,7 +413,7 @@ OLAPStatus BaseExpansionHandler::_do_base_expansion(VersionHash new_base_version
                   _table->full_name().c_str(),
                   _new_base_version.second);
 
-    // 2. 执行base expansion的merge
+    // 2. 执行base compaction的merge
     // 注æ„:无论是行列存,还是列存,在执行merge时都使用Merger类,ä¸èƒ½ä½¿ç”¨MassiveMerger。
     // 原因:MassiveMerger中的baseæ–‡ä»¶ä¸æ˜¯é€šè¿‡Reader读å–的,所以会导致删除æ¡ä»¶å¤±æ•ˆ,
     //       无法达到删除数æ®çš„目的
@@ -433,7 +434,7 @@ OLAPStatus BaseExpansionHandler::_do_base_expansion(VersionHash new_base_version
         }
         _table->release_header_lock();
 
-        Merger merger(_table, new_base, READER_BASE_EXPANSION);
+        Merger merger(_table, new_base, READER_BASE_COMPACTION);
         res = merger.merge(
                 *base_data_sources, use_simple_merge, &merged_rows, &filted_rows);
         if (res == OLAP_SUCCESS) {
@@ -499,7 +500,7 @@ OLAPStatus BaseExpansionHandler::_do_base_expansion(VersionHash new_base_version
     return OLAP_SUCCESS;
 }
 
-OLAPStatus BaseExpansionHandler::_update_header(const vector& selectivities,
+OLAPStatus BaseCompaction::_update_header(const vector& selectivities,
                                                 uint64_t row_count,
                                                 vector* unused_olap_indices) {
     vector unused_versions;
@@ -523,7 +524,7 @@ OLAPStatus BaseExpansionHandler::_update_header(const vector& selectiv
 
     OLAP_LOG_INFO("BE remove delete conditions. [removed_version=%d]", _new_base_version.second);
 
-    // Base Expansion完æˆä¹‹åŽï¼Œéœ€è¦åˆ é™¤header中版本å·å°äºŽç­‰äºŽæ–°base文件版本å·çš„删除æ¡ä»¶
+    // Base Compaction完æˆä¹‹åŽï¼Œéœ€è¦åˆ é™¤header中版本å·å°äºŽç­‰äºŽæ–°base文件版本å·çš„删除æ¡ä»¶
     DeleteConditionHandler cond_handler;
     cond_handler.delete_cond(_table, _new_base_version.second, true);
 
@@ -543,7 +544,7 @@ OLAPStatus BaseExpansionHandler::_update_header(const vector& selectiv
     return OLAP_SUCCESS;
 }
 
-void BaseExpansionHandler::_delete_old_files(vector* unused_indices) {
+void BaseCompaction::_delete_old_files(vector* unused_indices) {
     if (!unused_indices->empty()) {
         OLAPUnusedIndex* unused_index = OLAPUnusedIndex::get_instance();
 
@@ -554,7 +555,7 @@ void BaseExpansionHandler::_delete_old_files(vector* unused_indices)
     }
 }
 
-void BaseExpansionHandler::_cleanup() {
+void BaseCompaction::_cleanup() {
     // æ¸…ç†æŽ‰å·²ç”Ÿæˆçš„版本文件
     for (vector::iterator it = _new_olap_indices.begin();
             it != _new_olap_indices.end(); ++it) {
@@ -565,11 +566,11 @@ void BaseExpansionHandler::_cleanup() {
 
     // 释放打开的é”
     _release_header_lock();
-    _release_base_expansion_lock();
-    _table->set_base_expansion_status(BASE_EXPANSION_WAITING, -1);
+    _release_base_compaction_lock();
+    _table->set_base_compaction_status(BASE_COMPACTION_WAITING, -1);
 }
 
-bool BaseExpansionHandler::_validate_need_merged_versions(
+bool BaseCompaction::_validate_need_merged_versions(
         const vector& candidate_versions) {
     if (candidate_versions.size() <= 1) {
         OLAP_LOG_WARNING("unenough versions need to be merged. [size=%lu]",
@@ -607,7 +608,7 @@ bool BaseExpansionHandler::_validate_need_merged_versions(
     return true;
 }
 
-OLAPStatus BaseExpansionHandler::_validate_delete_file_action() {
+OLAPStatus BaseCompaction::_validate_delete_file_action() {
     // 1. acquire the latest version to make sure all is right after deleting files
     _obtain_header_rdlock();
     const FileVersionMessage* latest_version = _table->latest_version();
diff --git a/be/src/olap/base_expansion_handler.h b/be/src/olap/base_compaction.h
similarity index 81%
rename from be/src/olap/base_expansion_handler.h
rename to be/src/olap/base_compaction.h
index cf14a3c2ae..2734f70a68 100644
--- a/be/src/olap/base_expansion_handler.h
+++ b/be/src/olap/base_compaction.h
@@ -13,8 +13,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#ifndef BDG_PALO_BE_SRC_OLAP_BASE_EXPANSION_HANDLER_H
-#define BDG_PALO_BE_SRC_OLAP_BASE_EXPANSION_HANDLER_H
+#ifndef BDG_PALO_BE_SRC_OLAP_BASE_COMPACTION_H
+#define BDG_PALO_BE_SRC_OLAP_BASE_COMPACTION_H
 
 #include 
 #include 
@@ -28,27 +28,27 @@ namespace palo {
 
 class IData;
 
-// @brief 实现对START_BASE_EXPANSION命令的处ç†é€»è¾‘,并返回处ç†ç»“æžœ
-class BaseExpansionHandler {
+// @brief 实现对START_BASE_COMPACTION命令的处ç†é€»è¾‘,并返回处ç†ç»“æžœ
+class BaseCompaction {
 public:
-    BaseExpansionHandler() :
+    BaseCompaction() :
             _new_base_version(0, 0),
             _old_base_version(0, 0),
-            _base_expansion_locked(false),
+            _base_compaction_locked(false),
             _header_locked(false) {}
 
-    virtual ~BaseExpansionHandler() {
-        _release_base_expansion_lock();
+    virtual ~BaseCompaction() {
+        _release_base_compaction_lock();
     }
 
-    // åˆå§‹åŒ–BaseExpansion, 主è¦å®Œæˆä»¥ä¸‹å·¥ä½œï¼š
-    // 1. æ£€æŸ¥æ˜¯å¦æ»¡è¶³base expansionç­–ç•¥
+    // åˆå§‹åŒ–BaseCompaction, 主è¦å®Œæˆä»¥ä¸‹å·¥ä½œï¼š
+    // 1. æ£€æŸ¥æ˜¯å¦æ»¡è¶³base compactionç­–ç•¥
     // 2. 如果满足,计算需è¦åˆå¹¶å“ªäº›ç‰ˆæœ¬
     //
     // è¾“å…¥å‚æ•°ï¼š
     // - table: 待执行BE的OLAPTable的智能指针
     // - is_manual_trigger
-    //   - 如果为true,则是手动执行START_BASE_EXPANSION命令
+    //   - 如果为true,则是手动执行START_BASE_COMPACTION命令
     //   - 如果为false,则是根æ®BEç­–ç•¥æ¥æ‰§è¡Œ
     //
     // 返回值:
@@ -56,7 +56,7 @@ public:
     // - 其它情况下,返回相应的错误ç 
     OLAPStatus init(SmartOLAPTable table, bool is_manual_trigger);
 
-    // 执行BaseExpansion, å¯èƒ½ä¼šæŒç»­å¾ˆé•¿æ—¶é—´
+    // 执行BaseCompaction, å¯èƒ½ä¼šæŒç»­å¾ˆé•¿æ—¶é—´
     //
     // 返回值:
     // - 如果执行æˆåŠŸï¼Œåˆ™è¿”å›žOLAP_SUCCESSï¼›
@@ -68,10 +68,10 @@ private:
     OLAPStatus _exclude_not_expired_delete(const std::vector& need_merged_versions,
                                            std::vector* candidate_versions);
 
-    // æ£€éªŒå½“å‰æƒ…å†µæ˜¯å¦æ»¡è¶³base expansion的触å‘ç­–ç•¥
+    // æ£€éªŒå½“å‰æƒ…å†µæ˜¯å¦æ»¡è¶³base compaction的触å‘ç­–ç•¥
     //
     // è¾“å…¥å‚æ•°ï¼š
-    // - is_manual_trigger: æ˜¯å¦æ˜¯æ‰‹åŠ¨æ‰§è¡ŒSTART_BASE_EXPANSION命令
+    // - is_manual_trigger: æ˜¯å¦æ˜¯æ‰‹åŠ¨æ‰§è¡ŒSTART_BASE_COMPACTION命令
     // è¾“å‡ºå‚æ•°
     // - candidate_versions: BEå¯åˆå¹¶çš„cumulative文件
     //
@@ -92,7 +92,7 @@ private:
     // 返回值:
     // - 如果执行æˆåŠŸï¼Œåˆ™è¿”å›žOLAP_SUCCESSï¼›
     // - 其它情况下,返回相应的错误ç 
-    OLAPStatus _do_base_expansion(VersionHash new_base_version_hash,
+    OLAPStatus _do_base_compaction(VersionHash new_base_version_hash,
                                   std::vector* base_data_sources,
                                   std::vector* selectivities,
                                   uint64_t* row_count);
@@ -158,19 +158,19 @@ private:
         return left.second < right.second;
     }
 
-    bool _try_base_expansion_lock() {
-        if (_table->try_base_expansion_lock()) {
-            _base_expansion_locked = true;
+    bool _try_base_compaction_lock() {
+        if (_table->try_base_compaction_lock()) {
+            _base_compaction_locked = true;
             return true;
         }
 
         return false;
     }
 
-    void _release_base_expansion_lock() {
-        if (_base_expansion_locked) {
-            _table->release_base_expansion_lock();
-            _base_expansion_locked = false;
+    void _release_base_compaction_lock() {
+        if (_base_compaction_locked) {
+            _table->release_base_compaction_lock();
+            _base_compaction_locked = false;
         }
     }
 
@@ -199,17 +199,17 @@ private:
     Version _old_base_version;
     // çŽ°æœ‰çš„ç‰ˆæœ¬å·æœ€å¤§çš„cumulative
     Version _latest_cumulative;
-    // 在此次base expansion执行过程中,将被åˆå¹¶çš„cumulative文件版本
+    // 在此次base compaction执行过程中,将被åˆå¹¶çš„cumulative文件版本
     std::vector _need_merged_versions;
     // éœ€è¦æ–°å¢žçš„版本对应的OLAPIndex
     std::vector _new_olap_indices;
 
-    bool _base_expansion_locked;
+    bool _base_compaction_locked;
     bool _header_locked;
 
-    DISALLOW_COPY_AND_ASSIGN(BaseExpansionHandler);
+    DISALLOW_COPY_AND_ASSIGN(BaseCompaction);
 };
 
 }  // namespace palo
 
-#endif // BDG_PALO_BE_SRC_OLAP_BASE_EXPANSION_HANDLER_H
+#endif // BDG_PALO_BE_SRC_OLAP_BASE_COMPACTION_H
diff --git a/be/src/olap/column_file/bloom_filter.hpp b/be/src/olap/column_file/bloom_filter.hpp
index cc1971ae83..dd4f6798c3 100644
--- a/be/src/olap/column_file/bloom_filter.hpp
+++ b/be/src/olap/column_file/bloom_filter.hpp
@@ -29,7 +29,7 @@ namespace palo {
 namespace column_file {
 
 static const uint64_t DEFAULT_SEED = 104729;
-static const uint64_t BLOOM_FILTER_NULL_HASHCODE = 2862933555777941757L;
+static const uint64_t BLOOM_FILTER_NULL_HASHCODE = 2862933555777941757ULL;
 
 struct BloomFilterIndexHeader {
     uint64_t block_count;
@@ -72,12 +72,12 @@ public:
 
     // Set the bit specified by param, note that uint64_t type contains 2^6 bits
     void set(uint32_t index) {
-        _data[index >> 6] |= 1L << index;
+        _data[index >> 6] |= 1L << (index % 64);
     }
 
     // Return true if the bit specified by param is set
     bool get(uint32_t index) const {
-        return (_data[index >> 6] & (1L << index)) != 0;
+        return (_data[index >> 6] & (1L << (index % 64))) != 0;
     }
 
     // Merge with another BitSet by byte, return false when the length is not equal
@@ -175,7 +175,7 @@ public:
         uint32_t hash2 = (uint32_t) (hash >> 32);
 
         for (uint32_t i = 0; i < _hash_function_num; ++i) {
-            uint32_t combine_hash = hash1 + hash2 * i;
+            uint64_t combine_hash = hash1 + hash2 * i;
             uint32_t index = combine_hash % _bit_num;
             _bit_set.set(index);
         }
@@ -194,7 +194,7 @@ public:
         uint32_t hash2 = (uint32_t) (hash >> 32);
 
         for (uint32_t i = 0; i < _hash_function_num; ++i) {
-            uint32_t combine_hash = hash1 + hash2 * i;
+            uint64_t combine_hash = hash1 + hash2 * i;
             uint32_t index = combine_hash % _bit_num;
             if (!_bit_set.get(index)) {
                 return false;
diff --git a/be/src/olap/column_file/byte_buffer.cpp b/be/src/olap/column_file/byte_buffer.cpp
index 4404bb9807..8b352a7b85 100644
--- a/be/src/olap/column_file/byte_buffer.cpp
+++ b/be/src/olap/column_file/byte_buffer.cpp
@@ -157,34 +157,6 @@ ByteBuffer* ByteBuffer::mmap(FileHandler* handler, uint64_t offset, int prot, in
     return buf;
 }
 
-OLAPStatus ByteBuffer::set_position(uint64_t new_position) {
-    if (new_position <= _limit) {
-        _position = new_position;
-        return OLAP_SUCCESS;
-    } else {
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-}
-
-OLAPStatus ByteBuffer::set_limit(uint64_t new_limit) {
-    if (new_limit > _capacity) {
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-
-    _limit = new_limit;
-
-    if (_position > _limit) {
-        _position = _limit;
-    }
-
-    return OLAP_SUCCESS;
-}
-
-void ByteBuffer::flip() {
-    _limit = _position;
-    _position = 0;
-}
-
 OLAPStatus ByteBuffer::put(char src) {
     if (_position < _limit) {
         _array[_position++] = src;
diff --git a/be/src/olap/column_file/byte_buffer.h b/be/src/olap/column_file/byte_buffer.h
index c2abb8c4b3..2249a4aab0 100644
--- a/be/src/olap/column_file/byte_buffer.h
+++ b/be/src/olap/column_file/byte_buffer.h
@@ -81,7 +81,14 @@ public:
     }
     // 设置内部指针的ä½ç½®
     // 如果新ä½ç½®å¤§äºŽç­‰äºŽlimit, 则返回OLAP_ERR_INPUT_PARAMETER_ERROR
-    OLAPStatus set_position(uint64_t new_position);
+    OLAPStatus set_position(uint64_t new_position) {
+        if (new_position <= _limit) {
+            _position = new_position;
+            return OLAP_SUCCESS;
+        } else {
+            return OLAP_ERR_INPUT_PARAMETER_ERROR;
+        }
+    }
 
     inline uint64_t limit() const {
         return _limit;
@@ -89,9 +96,21 @@ public:
     //设置新的limit
     //如果limit超过capacity, 返回OLAP_ERR_INPUT_PARAMETER_ERROR
     //如果position大于新的limit, 设置position等于limit
-    OLAPStatus set_limit(uint64_t new_limit);
+    OLAPStatus set_limit(uint64_t new_limit) {
+        if (new_limit > _capacity) {
+            return OLAP_ERR_INPUT_PARAMETER_ERROR;
+        }
 
-    uint64_t remaining() const {
+        _limit = new_limit;
+
+        if (_position > _limit) {
+            _position = _limit;
+        }
+
+        return OLAP_SUCCESS;
+    }
+
+    inline uint64_t remaining() const {
         return _limit - _position;
     }
 
@@ -99,7 +118,10 @@ public:
     // 将position设置为0
     // 这个函数å¯ä»¥ç”¨äºŽå°†ByteBuffer从写状æ€è½¬ä¸ºè¯»çжæ€, å³åœ¨è¿›è¡Œä¸€äº›å†™ä¹‹åŽ
     // 调用本函数,之åŽå¯ä»¥å¯¹ByteBufferåšè¯»æ“作.
-    void flip();
+    void flip() {
+        _limit = _position;
+        _position = 0;
+    }
 
     // 以下三个读å–函数进行inline优化
 
diff --git a/be/src/olap/column_file/column_data.cpp b/be/src/olap/column_file/column_data.cpp
index 185b1adb61..c39f59ecec 100644
--- a/be/src/olap/column_file/column_data.cpp
+++ b/be/src/olap/column_file/column_data.cpp
@@ -25,172 +25,92 @@ namespace column_file {
 
 ColumnData::ColumnData(OLAPIndex* olap_index) : 
         IData(COLUMN_ORIENTED_FILE, olap_index),
-        _end_key(NULL),
-        _last_end_key(false),
         _is_using_cache(false),
-        _segment_reader(NULL),
-        _filted_rows(0),
-        _current_segment(0),
-        _row_block(NULL) {
+        _segment_reader(NULL) {
     _table = olap_index->table();
+    _num_rows_per_block = _table->num_rows_per_row_block();
 }
 
 ColumnData::~ColumnData() {
     _olap_index->release();
-    SAFE_DELETE(_end_key);
     SAFE_DELETE(_segment_reader);
-    SAFE_DELETE(_row_block);
 }
 
 OLAPStatus ColumnData::init() {
-    OLAPStatus res = OLAP_SUCCESS;
     _olap_index->acquire();
+    
+    auto res = _short_key_cursor.init(_olap_index->short_key_fields());
+    if (res != OLAP_SUCCESS) {
+        LOG(WARNING) << "key cursor init failed, table:" << _table->id()
+            << ", res:" << res;
+        return res;
+    }
     return res;
 }
 
-void ColumnData::set_conjuncts(std::vector* query_conjuncts, 
-                               std::vector* delete_conjuncts) {
-}
-
-const RowCursor* ColumnData::get_first_row() {
-    OLAPStatus res;
-
-    if (olap_index()->num_segments() == 0) {
-        set_eof(true);
-        return NULL;
-    }
-
-    RowBlockPosition block_pos;
-    block_pos.segment = 0u;
-    block_pos.data_offset = 0u;
-
-    if (OLAP_SUCCESS != (res = _seek_to_block(block_pos, false))) {
-        if (OLAP_ERR_DATA_EOF == res) {
-            OLAP_LOG_WARNING("stream EOF. "
-                    "[res=%d segment=%d block_size=%d data_offset=%d index_offset=%d]",
-                    res,
-                    block_pos.segment, block_pos.block_size,
-                    block_pos.data_offset, block_pos.index_offset);
-            set_eof(true);
-        } else {
-            OLAP_LOG_WARNING("fail to get row block. "
-                    "[res=%d segment=%d block_size=%d data_offset=%d index_offset=%d]",
-                    res,
-                    block_pos.segment, block_pos.block_size,
-                    block_pos.data_offset, block_pos.index_offset);
+OLAPStatus ColumnData::get_next_block(RowBlock** row_block) {
+    SCOPED_RAW_TIMER(&_stats->block_fetch_ns);
+    _is_normal_read = true;
+    auto res = _get_block(false);
+    if (res != OLAP_SUCCESS) {
+        if (res != OLAP_ERR_DATA_EOF) {
+            LOG(WARNING) << "Get next block failed.";
         }
-
-        return NULL;
-    }
-
-    return get_next_row();
-}
-
-const RowCursor* ColumnData::get_current_row() {
-    return _segment_reader->get_current_row();
-}
-
-const RowCursor* ColumnData::get_next_row() {
-    return _get_next_row(false);
-}
-
-const RowCursor* ColumnData::_get_next_row(bool without_filter) {
-    const RowCursor* cursor = _segment_reader->get_next_row(without_filter);
-
-    // fast path
-    // 查找end_keyæ—¶ï¼Œåªæ˜¯æ ¹æ®block的索引(å³ç¬¬ä¸€è¡Œçš„key)比较,
-    // end_keyå¯èƒ½å­˜åœ¨äºŽä¸Šä¸€ä¸ªblock中
-    if (OLAP_LIKELY(NULL != cursor)) {
-        if (OLAP_LIKELY(NULL == _end_key 
-                || _current_segment < _end_key_block_position.segment 
-                || (_current_segment == _end_key_block_position.segment 
-                && _segment_reader->current_block() < _end_key_block_position.data_offset))) {
-            return cursor;
-        } else {
-            int cmp = cursor->cmp(*_end_key);
-
-            if ((_last_end_key && cmp > 0) || (!_last_end_key && cmp >= 0)) {
-                set_eof(true);
-                return NULL;
-            } else {
-                return cursor;
-            }
-        }
-    } else {
-        if (_segment_reader->eof()) {
-            if (((_end_key != NULL) && 
-                    (_current_segment + 1 <= _end_key_block_position.segment)) || 
-                    ((_end_key == NULL) && 
-                    (_current_segment + 1 < _olap_index->num_segments()))) {
-                OLAPStatus res;
-                RowBlockPosition block_pos;
-                block_pos.segment = _current_segment + 1;
-                block_pos.data_offset = 0u;
-
-                res = _seek_to_block(block_pos, without_filter);
-                if (OLAP_SUCCESS != res) {
-                    if (OLAP_ERR_DATA_EOF == res) {
-                        OLAP_LOG_WARNING("stream EOF. "
-                                "[res=%d segment=%d block_size=%d data_offset=%d index_offset=%d]",
-                                res,
-                                block_pos.segment, block_pos.block_size,
-                                block_pos.data_offset, block_pos.index_offset);
-                        set_eof(true);
-                    } else {
-                        OLAP_LOG_WARNING("fail to get row block. "
-                                "[res=%d segment=%d block_size=%d data_offset=%d index_offset=%d]",
-                                res,
-                                block_pos.segment, block_pos.block_size,
-                                block_pos.data_offset, block_pos.index_offset);
-                    }
-                } else {
-                    // TODO:这里写了一个尾递归, 是å¯ä»¥ä¼˜åŒ–去掉递归的
-                    return _get_next_row(without_filter);
-                }
-            } else {
-                set_eof(true);
-            }
-        } else {
-            OLAP_LOG_WARNING("fail to reader segment.");
-        }
-    }
-
-    return NULL;
-}
-
-OLAPStatus ColumnData::_find_row_block(
-        const RowCursor& key,
-        bool find_last_key,
-        RowBlockPosition* block_pos) {
-    OLAPStatus res = OLAP_SUCCESS;
-    RowCursor helper_cursor;
-    
-    res = helper_cursor.init(olap_index()->short_key_fields());
-    if (OLAP_SUCCESS != res) {
-        OLAP_LOG_WARNING("init helper_cursor fail.");
+        *row_block = nullptr;
         return res;
     }
-
-    return olap_index()->find_short_key(key, &helper_cursor, find_last_key, block_pos);
+    *row_block = _read_block.get();
+    return OLAP_SUCCESS;
 }
 
-OLAPStatus ColumnData::_find_prev_row_block(RowBlockPosition* block_pos) {
-    RowBlockPosition current = *block_pos;
-    return olap_index()->find_prev_point(current, block_pos);
+OLAPStatus ColumnData::_next_row(const RowCursor** row, bool without_filter) {
+    _read_block->pos_inc();
+    do {
+        if (_read_block->has_remaining()) {
+            // 1. get one row for vectorized_row_batch
+            size_t pos = _read_block->pos();
+            _read_block->get_row(pos, &_cursor);
+            if (without_filter) {
+                *row = &_cursor;
+                return OLAP_SUCCESS;
+            }
+
+            // when without_filter is true, _include_blocks is nullptr
+            if (_read_block->block_status() == DEL_NOT_SATISFIED) {
+                *row = &_cursor;
+                return OLAP_SUCCESS;
+            } else {
+                DCHECK(_read_block->block_status() == DEL_PARTIAL_SATISFIED);
+                bool row_del_filter = _delete_handler.is_filter_data(
+                    _olap_index->version().second, _cursor);
+                if (!row_del_filter) {
+                    *row = &_cursor;
+                    return OLAP_SUCCESS;
+                }
+                // This row is filtered, continue to process next row
+                _stats->rows_del_filtered++;
+                _read_block->pos_inc();
+            }
+        } else {
+            // get_next_block
+            auto res = _get_block(without_filter);
+            if (res != OLAP_SUCCESS) {
+                return res;
+            }
+        }
+    } while (true);
+
+    return OLAP_SUCCESS;
 }
 
 OLAPStatus ColumnData::_seek_to_block(const RowBlockPosition& block_pos, bool without_filter) {
-    OLAPStatus res;
-    RuntimeProfile::Counter* read_data_timer = NULL;
-    if (_profile != NULL) {
-        read_data_timer = _profile->get_counter("ReadDataTime");    
-    }
-    SCOPED_TIMER(read_data_timer);
-
+    // TODO(zc): _segment_readers???
     // open segment reader if needed
-    if (NULL == _segment_reader || _current_segment != block_pos.segment) {
-        if (NULL != _segment_reader) {
-            add_filted_rows(_segment_reader->get_filted_rows());
+    if (_segment_reader == nullptr || block_pos.segment != _current_segment) {
+        if (block_pos.segment >= _olap_index->num_segments() ||
+            (_end_key_is_set && block_pos.segment > _end_segment)) {
+            _eof = true;
+            return OLAP_ERR_DATA_EOF;
         }
         SAFE_DELETE(_segment_reader);
         std::string file_name;
@@ -198,15 +118,16 @@ OLAPStatus ColumnData::_seek_to_block(const RowBlockPosition& block_pos, bool wi
                     olap_index()->version_hash(),
                     block_pos.segment);
         _segment_reader = new(std::nothrow) SegmentReader(
-                file_name, _table, olap_index(),  block_pos.segment, _return_columns,
-                _load_bf_columns, _conditions, _delete_handler, _delete_status,  _runtime_state);
-        if (NULL == _segment_reader) {
+                file_name, _table, olap_index(),  block_pos.segment,
+                _seek_columns, _load_bf_columns, _conditions,
+                _col_predicates, _delete_handler, _delete_status, _runtime_state, _stats);
+        if (_segment_reader == nullptr) {
             OLAP_LOG_WARNING("fail to malloc segment reader.");
             return OLAP_ERR_MALLOC_ERROR;
         }
 
-        _current_segment = block_pos.segment;
-        res = _segment_reader->init(_is_using_cache);
+        _current_segment = block_pos.segment; 
+        auto res = _segment_reader->init(_is_using_cache);
         if (OLAP_SUCCESS != res) {
             OLAP_LOG_WARNING("fail to init segment reader. [res=%d]", res);
             return res;
@@ -214,93 +135,84 @@ OLAPStatus ColumnData::_seek_to_block(const RowBlockPosition& block_pos, bool wi
     }
 
     uint32_t end_block;
-
-    if (NULL != _end_key && _end_key_block_position.segment == block_pos.segment) {
-        end_block = _end_key_block_position.data_offset;
+    if (_end_key_is_set && block_pos.segment == _end_segment) {
+        end_block = _end_block;
     } else {
         end_block = _segment_reader->block_count() - 1;
     }
 
     OLAP_LOG_DEBUG("###---### seek from %u to %u", block_pos.data_offset, end_block);
-    return _segment_reader->seek_to_block(block_pos.data_offset, end_block, without_filter);
+    return _segment_reader->seek_to_block(
+        block_pos.data_offset, end_block, without_filter, &_next_block, &_segment_eof);
 }
 
 OLAPStatus ColumnData::_find_position_by_short_key(
         const RowCursor& key, bool find_last_key, RowBlockPosition *position) {
-    OLAPStatus res = OLAP_SUCCESS;
-    
-    res = _find_row_block(key, find_last_key, position);
-    if (OLAP_ERR_INDEX_EOF == res) {
-        _eof = true;
-        return OLAP_SUCCESS;
-    } else if (OLAP_SUCCESS != res) {
-        OLAP_LOG_WARNING("find row block failed. [res=%d]", res);
+    RowBlockPosition tmp_pos;
+    auto res = _olap_index->find_short_key(key, &_short_key_cursor, find_last_key, &tmp_pos);
+    if (res != OLAP_SUCCESS) {
+        if (res == OLAP_ERR_INDEX_EOF) {
+            res = OLAP_ERR_DATA_EOF;
+        } else {
+            OLAP_LOG_WARNING("find row block failed. [res=%d]", res);
+        }
         return res;
     }
-
-    res = _find_prev_row_block(position);
-    if (OLAP_SUCCESS != res) {
+    res = olap_index()->find_prev_point(tmp_pos, position);
+    if (res != OLAP_SUCCESS) {
         OLAP_LOG_WARNING("find prev row block failed. [res=%d]", res);
         return res;
     }
-
     return OLAP_SUCCESS;
 }
 
 OLAPStatus ColumnData::_find_position_by_full_key(
         const RowCursor& key, bool find_last_key, RowBlockPosition *position) {
-    OLAPStatus res = OLAP_SUCCESS;
-    set_eof(false);
-    OlapStopWatch time_watch;
-
-    RowBlockPosition start_position;
-    res = _find_row_block(key, false, &start_position);
-
-    if (OLAP_ERR_INDEX_EOF == res) {
-        set_eof(true);
-        return OLAP_SUCCESS;
-    } else if (OLAP_SUCCESS != res) {
-        OLAP_LOG_WARNING("find row block failed. [res=%d]", res);
+    RowBlockPosition tmp_pos;
+    auto res = _olap_index->find_short_key(key, &_short_key_cursor, false, &tmp_pos);
+    if (res != OLAP_SUCCESS) {
+        if (res == OLAP_ERR_INDEX_EOF) {
+            res = OLAP_ERR_DATA_EOF;
+        } else {
+            OLAP_LOG_WARNING("find row block failed. [res=%d]", res);
+        }
         return res;
     }
-
-    res = _find_prev_row_block(&start_position);
-
-    if (OLAP_SUCCESS != res) {
+    RowBlockPosition start_position;
+    res = olap_index()->find_prev_point(tmp_pos, &start_position);
+    if (res != OLAP_SUCCESS) {
         OLAP_LOG_WARNING("find prev row block failed. [res=%d]", res);
         return res;
     }
 
     RowBlockPosition end_position;
-    res = _find_row_block(key, true, &end_position);
-    if (OLAP_ERR_INDEX_EOF == res) {
-        set_eof(true);
-        return res;
-    } else if (OLAP_SUCCESS != res) {
-        OLAP_LOG_WARNING("find row block failed. [res=%d]", res);
+    res = _olap_index->find_short_key(key, &_short_key_cursor, true, &end_position);
+    if (res != OLAP_SUCCESS) {
+        if (res == OLAP_ERR_INDEX_EOF) {
+            res = OLAP_ERR_DATA_EOF;
+        } else {
+            OLAP_LOG_WARNING("find row block failed. [res=%d]", res);
+        }
         return res;
     }
 
     // choose min value of end_position and m_end_key_block_position as real end_position
-    if (_end_key != NULL && end_position > _end_key_block_position) {
-        OLAPIndexOffset index_offset;
-        index_offset.segment = _end_key_block_position.segment;
-        index_offset.offset = _end_key_block_position.data_offset;
-        res = olap_index()->get_row_block_position(index_offset, &end_position);
-        if (res != OLAP_SUCCESS) {
-            OLAP_LOG_WARNING("fail to get row block position. [res=%d]", res);
-            return res;
+    if (_end_key_is_set) {
+        RowBlockPosition end_key_position;
+        end_key_position.segment = _end_segment;
+        end_key_position.data_offset = _end_block;
+        if (end_position > end_key_position) {
+            OLAPIndexOffset index_offset;
+            index_offset.segment = _end_segment;
+            index_offset.offset = _end_block;
+            res = olap_index()->get_row_block_position(index_offset, &end_position);
+            if (res != OLAP_SUCCESS) {
+                OLAP_LOG_WARNING("fail to get row block position. [res=%d]", res);
+                return res;
+            }
         }
     }
 
-    // helper rowcursor for OLAPDataComparator
-    RowCursor data_helper_cursor;
-    if (OLAP_SUCCESS != 
-            (res = data_helper_cursor.init(_table->tablet_schema()))) {
-        OLAP_LOG_FATAL("fail to init row cursor. [res=%d]", res);
-        return res;
-    }
-
     // ????end_position
     uint32_t distance = olap_index()->compute_distance(start_position, end_position);
 
@@ -308,10 +220,9 @@ OLAPStatus ColumnData::_find_position_by_full_key(
     BinarySearchIterator it_end(distance + 1);
     BinarySearchIterator it_result(0u);
     ColumnDataComparator comparator(
-            start_position, 
-            this, 
-            olap_index(), 
-            &data_helper_cursor);
+            start_position,
+            this,
+            olap_index());
     try {
         if (!find_last_key) {
             it_result = std::lower_bound(it_start, it_end, key, comparator);
@@ -337,136 +248,148 @@ OLAPStatus ColumnData::_find_position_by_full_key(
         return res;
     }
 
-    if (_end_key && start_position > _end_key_block_position) {
-        // 查找的是end key之外的数æ®
-        set_eof(true);
-        return OLAP_SUCCESS;
-    } else {
-        set_eof(false);
+    if (_end_key_is_set) {
+        RowBlockPosition end_key_position;
+        end_key_position.segment = _end_segment;
+        end_key_position.data_offset = _end_block;
+        if (end_position > end_key_position) {
+            return OLAP_ERR_DATA_EOF;
+        }
     }
 
     *position = start_position;
     return OLAP_SUCCESS;
 }
 
-const RowCursor* ColumnData::find_row(const RowCursor& key, bool find_last_key, bool is_end_key) {
-    OLAPStatus res = OLAP_SUCCESS;
+OLAPStatus ColumnData::_seek_to_row(const RowCursor& key, bool find_last_key, bool is_end_key) {
     RowBlockPosition position;
-    
-    _eof = false;
+    OLAPStatus res = OLAP_SUCCESS;
     FieldType type = _table->get_field_type_by_index(key.field_count() - 1);
     if (key.field_count() > _table->num_short_key_fields() || OLAP_FIELD_TYPE_VARCHAR == type) {
         res = _find_position_by_full_key(key, find_last_key, &position);
     } else {
         res = _find_position_by_short_key(key, find_last_key, &position);
     }
-
-    if (OLAP_SUCCESS != res) {
-        OLAP_LOG_WARNING("Fail to find the key.[res=%d key=%s find_last_key=%d]", 
-                res, key.to_string().c_str(), find_last_key);
-        return NULL;
-    } else if (_eof) {
-        OLAP_LOG_DEBUG("EOF when find the key.[res=%d key=%s find_last_key=%d]", 
-                res, key.to_string().c_str(), find_last_key);
-        return NULL;
+    if (res != OLAP_SUCCESS) {
+        if (res != OLAP_ERR_DATA_EOF) {
+            OLAP_LOG_WARNING("Fail to find the key.[res=%d key=%s find_last_key=%d]", 
+                             res, key.to_string().c_str(), find_last_key);
+        }
+        return res;
     }
-
     bool without_filter = is_end_key;
     res = _seek_to_block(position, without_filter);
-    if (OLAP_SUCCESS != res) {
-        if (OLAP_ERR_DATA_EOF == res) {
-            OLAP_LOG_WARNING("stream EOF. "
-                    "[res=%d segment=%d block_size=%d data_offset=%d index_offset=%d]",
-                    res,
-                    position.segment, position.block_size,
-                    position.data_offset, position.index_offset);
-            set_eof(true);
-        } else {
-            OLAP_LOG_WARNING("fail to get row block. "
-                    "[res=%d segment=%d block_size=%d data_offset=%d index_offset=%d]",
-                    res,
-                    position.segment, position.block_size,
-                    position.data_offset, position.index_offset);
+    if (res != OLAP_SUCCESS) {
+        OLAP_LOG_WARNING("fail to get row block. "
+                         "[res=%d segment=%d block_size=%d data_offset=%d index_offset=%d]",
+                         res,
+                         position.segment, position.block_size,
+                         position.data_offset, position.index_offset);
+        return res;
+    }
+    res = _get_block(without_filter);
+    if (res != OLAP_SUCCESS) {
+        if (res != OLAP_ERR_DATA_EOF) {
+            OLAP_LOG_WARNING("Fail to find the key.[res=%d key=%s find_last_key=%d]", 
+                             res, key.to_string().c_str(), find_last_key);
         }
-
-        return NULL;
+        return res;
     }
 
-    const RowCursor* row_cursor = NULL;
+    const RowCursor* row_cursor = _current_row();
     if (!find_last_key) {
         // 䏿‰¾last key。 那么应该返回大于等于这个key的第一个,也就是
         // row_cursor >= key
         // 此处比较2个blockçš„è¡Œæ•°ï¼Œæ˜¯å­˜åœ¨ä¸€ç§æžé™æƒ…况:若未找到满足的block,
         // Index模å—会返回倒数第二个block,此时keyå¯èƒ½æ˜¯æœ€åŽä¸€ä¸ªblock的最åŽä¸€è¡Œ
-        while (NULL != (row_cursor = _get_next_row(without_filter)) && !eof()
-                && row_cursor->cmp(key) < 0) {}
+        while (res == OLAP_SUCCESS && row_cursor->cmp(key) < 0) {
+            res = _next_row(&row_cursor, without_filter);
+        }
     } else {
         // 找last key。返回大于这个key的第一个。也就是
         // row_cursor > key
-        while (NULL != (row_cursor = _get_next_row(without_filter)) && !eof()
-                && row_cursor->cmp(key) <= 0) {}
+        while (res == OLAP_SUCCESS && row_cursor->cmp(key) <= 0) {
+            res = _next_row(&row_cursor, without_filter);
+        }
     }
 
-    return row_cursor;
+    return res;
 }
 
 const RowCursor* ColumnData::seek_and_get_current_row(const RowBlockPosition& position) {
-    OLAPStatus res = OLAP_SUCCESS;
-
-    res = _seek_to_block(position, true);
-    if (OLAP_SUCCESS != res) {
-        OLAP_LOG_WARNING("fail to seek block. [res=%d]", res);
-        if (OLAP_ERR_DATA_EOF == res) {
-            set_eof(true);
-        }
-        return NULL;
+    auto res = _seek_to_block(position, true);
+    if (res != OLAP_SUCCESS) {
+        LOG(WARNING) << "Fail to seek to block in seek_and_get_current_row, res=" << res
+            << ", segment:" << position.segment << ", block:" << position.data_offset;
+        return nullptr;
     }
-
-    const RowCursor* cursor = _segment_reader->get_next_row(true);
-    if (cursor != NULL) {
-        return cursor;
+    res = _get_block(true);
+    if (res != OLAP_SUCCESS) {
+        LOG(WARNING) << "Fail to get block in seek_and_get_current_row, res=" << res
+            << ", segment:" << position.segment << ", block:" << position.data_offset;
+        return nullptr;
     }
-
-    OLAP_LOG_WARNING("fail to read segment.");
-    return NULL;
+    return _current_row();
 }
 
-OLAPStatus ColumnData::set_end_key(const RowCursor* end_key, bool find_last_end_key) {
-    OLAPStatus res;
-
-    if (NULL == end_key) {
-        return OLAP_SUCCESS;
+OLAPStatus ColumnData::prepare_block_read(
+        const RowCursor* start_key, bool find_start_key,
+        const RowCursor* end_key, bool find_end_key,
+        RowBlock** first_block) {
+    SCOPED_RAW_TIMER(&_stats->block_fetch_ns);
+    set_eof(false);
+    _end_key_is_set = false;
+    _is_normal_read = false;
+    // set end position
+    if (end_key != nullptr) {
+        auto res = _seek_to_row(*end_key, find_end_key, true);
+        if (res == OLAP_SUCCESS) {
+            // we find a 
+            _end_segment = _current_segment;
+            _end_block = _current_block;
+            _end_row_index = _read_block->pos();
+            _end_key_is_set = true;
+        } else if (res != OLAP_ERR_DATA_EOF) {
+            LOG(WARNING) << "Find end key failed.key=" << end_key->to_string();
+            return res;
+        }
+        // res == OLAP_ERR_DATA_EOF means there is no end key, then we read to
+        // the end of this ColumnData
     }
+    set_eof(false);
+    if (start_key != nullptr) {
+        auto res = _seek_to_row(*start_key, find_start_key, false);
+        if (res == OLAP_SUCCESS) {
+            *first_block = _read_block.get();
+        } else if (res == OLAP_ERR_DATA_EOF) {
+            _eof = true;
+            *first_block = nullptr;
+            return res;
+        } else {
+            LOG(WARNING) << "start_key can't be found.key=" << start_key->to_string();
+            return res;
+        }
+    } else {
+        // This is used to 
+        _is_normal_read = true;
 
-    SAFE_DELETE(_end_key);
-
-    if (NULL == find_row(*end_key, find_last_end_key, true)) {
-        OLAP_LOG_DEBUG("end_key can't be found.[end_key=%s]", end_key->to_string().c_str());
-        return OLAP_SUCCESS;
+        RowBlockPosition pos;
+        pos.segment = 0u;
+        pos.data_offset = 0u;
+        auto res = _seek_to_block(pos, false);
+        if (res != OLAP_SUCCESS) {
+            LOG(WARNING) << "failed to seek to block in, res=" << res
+                << ", segment:" << pos.segment << ", block:" << pos.data_offset;
+            return res;
+        }
+        res = _get_block(false);
+        if (res != OLAP_SUCCESS) {
+            LOG(WARNING) << "failed to get block in , res=" << res
+                << ", segment:" << pos.segment << ", block:" << pos.data_offset;
+            return res;
+        }
+        *first_block = _read_block.get();
     }
-    
-    _end_key = new(std::nothrow) RowCursor();
-
-    if (NULL == _end_key) {
-        OLAP_LOG_WARNING("fail to malloc RowCursor.");
-        return OLAP_ERR_MALLOC_ERROR;
-    }
-
-    std::vector end_key_field_lengths;
-    end_key->get_field_buf_lengths(&end_key_field_lengths);
-    res = _end_key->init_keys(_table->tablet_schema(), end_key_field_lengths);
-
-    if (OLAP_SUCCESS != res) {
-        OLAP_LOG_WARNING("fail to init RowCursor. [res=%d]", res);
-        return res;
-    }
-
-    _end_key->copy(*end_key);
-    _last_end_key = find_last_end_key;
-
-    _end_key_block_position.segment = _current_segment;
-    _end_key_block_position.data_offset = _segment_reader->current_block();
-    
     return OLAP_SUCCESS;
 }
 
@@ -479,216 +402,235 @@ void ColumnData::set_read_params(
         const std::vector& return_columns,
         const std::set& load_bf_columns,
         const Conditions& conditions,
+        const std::vector& col_predicates,
         const std::vector& start_keys,
         const std::vector& end_keys,
         bool is_using_cache,
         RuntimeState* runtime_state) {
     _conditions = &conditions;
+    _col_predicates = &col_predicates;
+    _need_eval_predicates = !col_predicates.empty();
     _is_using_cache = is_using_cache;
     _runtime_state = runtime_state;
-    //_return_columns = return_columns;
+    _return_columns = return_columns;
     _load_bf_columns = load_bf_columns;
 
-    std::unordered_set column_set;
+    std::unordered_set column_set(_return_columns.begin(), _return_columns.end());
 
-    for (std::vector::const_iterator it = return_columns.begin();
-            it != return_columns.end(); ++it) {
-        column_set.insert(*it);
-    }
-
-    for (Conditions::CondColumns::const_iterator it = conditions.columns().begin();
-            it != conditions.columns().end(); ++it) {
-        column_set.insert(it->first);
+    for (auto& it : conditions.columns()) {
+        column_set.insert(it.first);
     }
 
     uint32_t max_key_column_count = 0;
-
-    for (std::vector::const_iterator it = start_keys.begin();
-            it != start_keys.end(); ++it) {
-        if ((*it)->field_count() > max_key_column_count) {
-            max_key_column_count = (*it)->field_count();
+    for (auto key : start_keys) {
+        if (key->field_count() > max_key_column_count) {
+            max_key_column_count = key->field_count();
         }
     }
 
-    for (std::vector::const_iterator it = end_keys.begin();
-            it != end_keys.end(); ++it) {
-        if ((*it)->field_count() > max_key_column_count) {
-            max_key_column_count = (*it)->field_count();
+    for (auto key : end_keys) {
+        if (key->field_count() > max_key_column_count) {
+            max_key_column_count = key->field_count();
         }
     }
 
     for (uint32_t i = 0; i < _table->tablet_schema().size(); i++) {
         if (i < max_key_column_count || column_set.find(i) != column_set.end()) {
-            _return_columns.push_back(i);
+            _seek_columns.push_back(i);
         }
     }
-}
 
-OLAPStatus ColumnData::_load_row_block() {
-    OLAPStatus res;
-
-    if (NULL == _row_block 
-            || _row_block->allocated_row_num() < _segment_reader->num_rows_in_block()) {
-        SAFE_DELETE(_row_block);
-        _row_block = new(std::nothrow) RowBlock(_table->tablet_schema());
-
-        if (NULL == _row_block) {
-            OLAP_LOG_WARNING("fail to allocate row block.");
-            return OLAP_ERR_MALLOC_ERROR;
-        }
-
-        RowBlockInfo info;
-        info.row_num = _segment_reader->num_rows_in_block();
-        info.data_file_type = DataFileType::COLUMN_ORIENTED_FILE;
-        info.null_supported = _olap_index->get_null_supported(0);  
-
-        res = _row_block->init(info);
-        if (OLAP_SUCCESS != res) {
-            OLAP_LOG_WARNING("fail to init row block. [res=%d]", res);
-            return res;
-        }
-    } else {
-        _row_block->clear();
+    auto res = _cursor.init(_table->tablet_schema());
+    if (res != OLAP_SUCCESS) {
+        OLAP_LOG_WARNING("fail to init row_cursor");
     }
 
-    uint32_t i = 0;
+    _read_vector_batch.reset(new VectorizedRowBatch(
+            _table->tablet_schema(), _return_columns, _num_rows_per_block));
 
-    do {
-        const RowCursor* cursor = get_next_row();
+    _seek_vector_batch.reset(new VectorizedRowBatch(
+            _table->tablet_schema(), _seek_columns, _num_rows_per_block));
 
-        if (eof()) {
-            break;
-        }
-
-        if (NULL == cursor) {
-            OLAP_LOG_WARNING("fail to get next row.");
-            return OLAP_ERR_COLUMN_DATA_LOAD_BLOCK;
-        }
-
-        res = _row_block->set_row(i, *cursor);
-        if (OLAP_SUCCESS != res) {
-            OLAP_LOG_WARNING("fail to set row. [res=%d]", res);
-            return OLAP_ERR_COLUMN_DATA_LOAD_BLOCK;
-        }
-
-        i++;
-    } while (i < _segment_reader->num_rows_in_block());
-
-    res = _row_block->finalize(i);
-    if (OLAP_SUCCESS != res) {
-        OLAP_LOG_WARNING("fail to finalize row block. [res=%d]", res);
-        return OLAP_ERR_COLUMN_DATA_LOAD_BLOCK;
-    }
-
-    return OLAP_SUCCESS;
-}
-
-OLAPStatus ColumnData::get_row_batch(
-        uint8_t* batch_buf,
-        uint32_t batch_buf_len,
-        uint32_t* start_row_index,
-        uint32_t* batch_row_num,
-        uint32_t* block_row_num,
-        std::vector& return_columns) {
-    return _segment_reader->get_row_batch(batch_buf, batch_buf_len, start_row_index,
-            batch_row_num, block_row_num, return_columns);
+    _read_block.reset(new RowBlock(_table->tablet_schema()));
+    RowBlockInfo block_info;
+    block_info.row_num = _num_rows_per_block;
+    block_info.null_supported = true;
+    _read_block->init(block_info);
 }
 
 OLAPStatus ColumnData::get_first_row_block(RowBlock** row_block) {
-    OLAPStatus res;
+    DCHECK(!_end_key_is_set) << "end key is set while use block interface.";
+    _is_normal_read = true;
+    _eof = false;
 
-    if (NULL == row_block) {
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-
-    // 使用get_next_row_blockæ—¶ä¸åº”该使用end key, OLAPData里的实现å¯ä»¥è°ƒæ•´ä¸€ä¸‹
-    if (NULL != _end_key) {
-        OLAP_LOG_WARNING("end key is set while use block interface.");
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-
-    // 这里继续使用olap_index是为了与OLAPData的行为(例如返回值)一致
-    res = olap_index()->find_first_row_block(&_row_block_pos);
-
-    if (OLAP_ERR_INDEX_EOF == res) {
-        *row_block = NULL;
-        set_eof(true);
+    auto res = _schema_change_init();
+    if (res != OLAP_SUCCESS) {
+        LOG(WARNING) << "failed to initial for schema change block read, res:" << res;
         return res;
-    } else if (OLAP_SUCCESS != res) {
+    }
+
+    // to be same with OLAPData, we use olap_index.
+    RowBlockPosition block_pos;
+    res = olap_index()->find_first_row_block(&block_pos);
+    if (res != OLAP_SUCCESS) {
+        if (res == OLAP_ERR_INDEX_EOF) {
+            *row_block = nullptr;
+            _eof = true;
+            return res;
+        }
         OLAP_LOG_WARNING("fail to find first row block with OLAPIndex.");
         return res;
     }
 
-    res = _seek_to_block(_row_block_pos, false);
-    if (OLAP_SUCCESS != res) {
-        OLAP_LOG_WARNING("seek to block fail. [res=%d]", res);
-        return res;
-    }
-
-    res = _load_row_block();
-    if (OLAP_SUCCESS != res) {
-        OLAP_LOG_WARNING("fail to load data to row block. [res=%d]", res);
-
-        if (OLAP_ERR_DATA_EOF == res) {
-            *row_block = NULL;
-            set_eof(true);
+    res = _seek_to_block(block_pos, false);
+    if (res != OLAP_SUCCESS) {
+        if (res != OLAP_ERR_DATA_EOF) {
+            OLAP_LOG_WARNING("seek to block fail. [res=%d]", res);
         }
-
+        *row_block = nullptr;
         return res;
     }
 
-    *row_block = _row_block;
+    res = _get_block(false);
+    if (res != OLAP_SUCCESS) {
+        if (res != OLAP_ERR_DATA_EOF) {
+            OLAP_LOG_WARNING("fail to load data to row block. [res=%d]", res);
+        }
+        *row_block = nullptr;
+        return res;
+    }
+
+    *row_block = _read_block.get();
     return OLAP_SUCCESS;
 }
 
 OLAPStatus ColumnData::get_next_row_block(RowBlock** row_block) {
-    if (NULL == row_block) {
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-
-    if (NULL == _row_block) {
-        OLAP_LOG_WARNING("row block is not initialized.");
-        return OLAP_ERR_INIT_FAILED;
-    }
-
-    if (eof()) {
-        *row_block = NULL;
-        return OLAP_ERR_DATA_EOF;
-    }
-
-    OLAPStatus res = _load_row_block();
-    if (OLAP_SUCCESS != res) {
-        OLAP_LOG_WARNING("fail to load data to row block. [res=%d]", res);
-
-        if (OLAP_ERR_DATA_EOF == res) {
-            *row_block = NULL;
-            set_eof(true);
+    _is_normal_read = true;
+    OLAPStatus res = _get_block(false);
+    if (res != OLAP_SUCCESS) {
+        if (res != OLAP_ERR_DATA_EOF) {
+            OLAP_LOG_WARNING("fail to load data to row block. [res=%d]", res);
         }
-
+        *row_block = nullptr;
         return res;
     }
 
-    *row_block = _row_block;
+    *row_block = _read_block.get();
     return OLAP_SUCCESS;
 }
 
-OLAPStatus ColumnData::pickle() {
-    return OLAP_SUCCESS;
-}
+OLAPStatus ColumnData::_schema_change_init() {
+    _is_using_cache = false;
 
-OLAPStatus ColumnData::unpickle() {
-    return OLAP_SUCCESS;
-}
-
-void ColumnData::add_filted_rows(uint64_t filted_rows) {
-    _filted_rows += filted_rows;
-}
-
-uint64_t ColumnData::get_filted_rows(){
-    if (NULL != _segment_reader) {
-        _filted_rows += _segment_reader->get_filted_rows();
+    for (int i = 0; i < _table->tablet_schema().size(); ++i) {
+        _return_columns.push_back(i);
+        _seek_columns.push_back(i);
     }
-    return _filted_rows;
+
+    auto res = _cursor.init(_table->tablet_schema());
+    if (res != OLAP_SUCCESS) {
+        OLAP_LOG_WARNING("fail to init row_cursor");
+        return res;
+    }
+
+    _read_vector_batch.reset(new VectorizedRowBatch(
+            _table->tablet_schema(), _return_columns, _num_rows_per_block));
+
+    _read_block.reset(new RowBlock(_table->tablet_schema()));
+
+    RowBlockInfo block_info;
+    block_info.row_num = _num_rows_per_block;
+    block_info.null_supported = true;
+    _read_block->init(block_info);
+    return OLAP_SUCCESS;
+}
+
+OLAPStatus ColumnData::_get_block_from_reader(
+        VectorizedRowBatch** got_batch, bool without_filter) {
+    VectorizedRowBatch* vec_batch = nullptr;
+    if (_is_normal_read) {
+        vec_batch = _read_vector_batch.get();
+    } else {
+        vec_batch = _seek_vector_batch.get();
+    }
+    // If this is normal read
+    do {
+#if 0
+        LOG(INFO) << "_current_segment is " << _current_segment
+            << ", _next_block:" << _next_block
+            << ", _end_segment::"  << _end_segment
+            << ", _end_block:" << _end_block
+            << ", _end_row_index:" << _end_row_index
+            << ", _segment_eof:" << _segment_eof;
+#endif
+        vec_batch->clear();
+        // If we are going to read last block, we need to set batch limit to the end of key
+        // if without_filter is true and _end_key_is_set is true, this must seek to start row's
+        // block, we must load the entire block.
+        if (OLAP_UNLIKELY(!without_filter &&
+                          _end_key_is_set &&
+                          _next_block == _end_block &&
+                          _current_segment == _end_segment)) {
+            vec_batch->set_limit(_end_row_index);
+            if (_end_row_index == 0) {
+                _segment_eof = true;
+            }
+        }
+
+        if (!_segment_eof) {
+            _current_block = _next_block;
+            auto res = _segment_reader->get_block(vec_batch, &_next_block, &_segment_eof);
+            if (res != OLAP_SUCCESS) {
+                return res;
+            }
+            // Normal case
+            *got_batch = vec_batch;
+            return OLAP_SUCCESS;
+        }
+        // When this segment is read over, we reach here.
+        // Seek to next segment
+        RowBlockPosition block_pos;
+        block_pos.segment = _current_segment + 1;
+        block_pos.data_offset = 0;
+        auto res = _seek_to_block(block_pos, without_filter);
+        if (res != OLAP_SUCCESS) {
+            return res;
+        }
+    } while (true);
+
+    return OLAP_SUCCESS;
+}
+
+OLAPStatus ColumnData::_get_block(bool without_filter) {
+    do {
+        VectorizedRowBatch* vec_batch = nullptr;
+        auto res = _get_block_from_reader(&vec_batch, without_filter);
+        if (res != OLAP_SUCCESS) {
+            return res;
+        }
+        // evaluate predicates
+        if (!without_filter && _need_eval_predicates) {
+            SCOPED_RAW_TIMER(&_stats->vec_cond_ns);
+            size_t old_size = vec_batch->size();
+            for (auto pred : *_col_predicates) {
+                pred->evaluate(vec_batch);
+            }
+            _stats->rows_vec_cond_filtered += old_size - vec_batch->size();
+        }
+        // if vector is empty after predicate evaluate, get next block
+        if (vec_batch->size() == 0) {
+            continue;
+        }
+        // when reach here, we have already read a block successfully
+        _read_block->clear();
+        vec_batch->dump_to_row_block(_read_block.get());
+        return OLAP_SUCCESS;
+    } while (true);
+    return OLAP_SUCCESS;
+}
+
+uint64_t ColumnData::get_filted_rows() {
+    return _stats->rows_del_filtered;
 }
 
 }  // namespace column_file
diff --git a/be/src/olap/column_file/column_data.h b/be/src/olap/column_file/column_data.h
index 9689218870..824355b86e 100644
--- a/be/src/olap/column_file/column_data.h
+++ b/be/src/olap/column_file/column_data.h
@@ -30,26 +30,26 @@ namespace column_file {
 
 class SegmentReader;
 
-// å®šä¹‰æ–‡ä»¶çš„è¯»å–æŽ¥å£, 接å£å®šä¹‰è§IData的定义
+// This class is the column data reader. This class will be used in two cases.
 class ColumnData : public IData {
 public:
     explicit ColumnData(OLAPIndex* olap_index);
     virtual ~ColumnData();
 
     virtual OLAPStatus init();
-    virtual void set_conjuncts(std::vector* query_conjuncts, 
-                               std::vector* delete_conjuncts);
-    virtual const RowCursor* get_first_row();
-    virtual const RowCursor* get_current_row();
-    virtual const RowCursor* get_next_row();
-    virtual const RowCursor* find_row(const RowCursor& key, bool find_last_key, bool is_end_key);
 
-    virtual OLAPStatus set_end_key(const RowCursor* end_key, bool find_last_end_key);
+    OLAPStatus prepare_block_read(
+            const RowCursor* start_key, bool find_start_key,
+            const RowCursor* end_key, bool find_end_key,
+            RowBlock** first_block) override;
+
+    OLAPStatus get_next_block(RowBlock** row_block) override;
 
     virtual void set_read_params(
             const std::vector& return_columns,
             const std::set& load_bf_columns,
             const Conditions& conditions,
+            const std::vector& col_predicates,
             const std::vector& start_keys,
             const std::vector& end_keys,
             bool is_using_cache,
@@ -58,66 +58,96 @@ public:
     virtual OLAPStatus get_first_row_block(RowBlock** row_block);
     virtual OLAPStatus get_next_row_block(RowBlock** row_block);
 
-    virtual OLAPStatus get_row_batch(
-            uint8_t* batch_buf,
-            uint32_t batch_buf_len,
-            uint32_t* start_row_index,
-            uint32_t* batch_row_num,
-            uint32_t* block_row_num,
-            std::vector& return_columns);
-
-    virtual OLAPStatus pickle();
-    virtual OLAPStatus unpickle();
+    OLAPStatus pickle() override { return OLAP_SUCCESS; }
+    OLAPStatus unpickle() override { return OLAP_SUCCESS; }
 
+    // Only used for binary search in full-key find row
     const RowCursor* seek_and_get_current_row(const RowBlockPosition& position);
 
     virtual uint64_t get_filted_rows();
 
-    void add_filted_rows(uint64_t filted_rows);
-
 private:
     DISALLOW_COPY_AND_ASSIGN(ColumnData);
-    OLAPStatus _find_row_block(const RowCursor& key,
-            bool find_last_key,
-            RowBlockPosition* block_pos);
-    OLAPStatus _find_prev_row_block(RowBlockPosition* block_pos);
-    OLAPStatus _seek_to_block(const RowBlockPosition &block_pos, bool without_filter);
-    OLAPStatus _load_row_block();
 
-    const RowCursor* _get_next_row(bool without_filter);
+    // To be compatible with schema change read, use this function to init column data
+    // for schema change read. Only called in get_first_row_block
+    OLAPStatus _schema_change_init();
+
+    // Try to seek to 'key'. If this function returns OLAP_SUCCESS, current_row()
+    // point to the first row meet the requirement.
+    // If there is no such row, OLAP_ERR_DATA_EOF will return.
+    // If an error happened, another error code will be returned
+    OLAPStatus _seek_to_row(const RowCursor& key, bool find_key, bool is_end_key);
+
+    // Seek to block_pos without loading that block; the caller must call _get_block()
+    // to load _read_block with data. If without_filter is false, this may seek to
+    // another block, because the seeked block may be filtered by condition or delete.
+    OLAPStatus _seek_to_block(const RowBlockPosition &block_pos, bool without_filter);
+
     OLAPStatus _find_position_by_short_key(
             const RowCursor& key, bool find_last_key, RowBlockPosition *position);
     OLAPStatus _find_position_by_full_key(
             const RowCursor& key, bool find_last_key, RowBlockPosition *position);
 
+    // Used in _seek_to_row, this function will go to the next row that is valid for this
+    // ColumnData
+    OLAPStatus _next_row(const RowCursor** row, bool without_filter);
+
+    // Get block from reader; just read a vector batch from _current_segment.
+    // The read batch is returned via got_batch.
+    OLAPStatus _get_block_from_reader(
+        VectorizedRowBatch** got_batch, bool without_filter);
+
+    // get block from segment reader. If this function returns OLAP_SUCCESS
+    OLAPStatus _get_block(bool without_filter);
+
+    const RowCursor* _current_row() {
+        _read_block->get_row(_read_block->pos(), &_cursor);
+        return &_cursor;
+    }
 private:
     OLAPTable* _table;
-    RowCursor* _end_key;                  // éžNULL表示设置了end key
-    bool _last_end_key;
+    // whether in normal read, use return columns to load block
+    bool _is_normal_read = false;
+    bool _end_key_is_set = false;
     bool _is_using_cache;
-    RowBlockPosition _end_key_block_position;
+    bool _segment_eof = false;
+    bool _need_eval_predicates = false;
+
     std::vector _return_columns;
+    std::vector _seek_columns;
     std::set _load_bf_columns;
     
     SegmentReader* _segment_reader;
-    uint64_t _filted_rows;
+
+    std::unique_ptr _seek_vector_batch;
+    std::unique_ptr _read_vector_batch;
+
+    std::unique_ptr _read_block = nullptr;
+    RowCursor _cursor;
+    RowCursor _short_key_cursor;
+
+    // Record when last key is found
+    uint32_t _current_block = 0;
     uint32_t _current_segment;
-    // 下é¢ä¸¤ä¸ªæˆå‘˜åªç”¨äºŽblock接å£
-    RowBlock* _row_block;                 // 用于get_first_row_block缓存数æ®
-    RowBlockPosition _row_block_pos;      // 与_row_block对应的pos
+    uint32_t _next_block;
+
+    uint32_t _end_segment;
+    uint32_t _end_block;
+    int64_t _end_row_index = 0;
+
+    size_t _num_rows_per_block;
 };
 
 class ColumnDataComparator {
 public:
     ColumnDataComparator(
-            RowBlockPosition position,
-            ColumnData* olap_data,
-            const OLAPIndex* index,
-            RowCursor* helper_cursor) : 
-            _start_block_position(position),
+        RowBlockPosition position,
+        ColumnData* olap_data,
+        const OLAPIndex* index)
+            : _start_block_position(position),
             _olap_data(olap_data),
-            _index(index),
-            _helper_cursor(helper_cursor) {}
+            _index(index) {}
 
     ~ColumnDataComparator() {}
 
@@ -142,7 +172,7 @@ private:
             throw ComparatorException();
         }
         const RowCursor* helper_cursor = _olap_data->seek_and_get_current_row(position);
-        if (NULL == helper_cursor) {
+        if (helper_cursor == nullptr) {
             OLAP_LOG_WARNING("fail to seek and get current row.");
             throw ComparatorException();
         }
@@ -157,7 +187,6 @@ private:
     const RowBlockPosition _start_block_position;
     ColumnData* _olap_data;
     const OLAPIndex* _index;
-    RowCursor* _helper_cursor;
 };
 
 }  // namespace column_file
diff --git a/be/src/olap/column_file/column_reader.cpp b/be/src/olap/column_file/column_reader.cpp
index c12ad80825..1526d3c8ce 100644
--- a/be/src/olap/column_file/column_reader.cpp
+++ b/be/src/olap/column_file/column_reader.cpp
@@ -20,7 +20,6 @@
 #include "olap/column_file/file_stream.h"
 #include "olap/olap_define.h"
 
-
 namespace palo {
 namespace column_file {
 IntegerColumnReader::IntegerColumnReader(uint32_t column_unique_id): 
@@ -78,6 +77,7 @@ StringColumnDirectReader::StringColumnDirectReader(
         uint32_t dictionary_size) : 
         _eof(false),
         _column_unique_id(column_unique_id),
+        _values(NULL),
         _data_stream(NULL),
         _length_reader(NULL) {
 }
@@ -86,7 +86,9 @@ StringColumnDirectReader::~StringColumnDirectReader() {
     SAFE_DELETE(_length_reader);
 }
 
-OLAPStatus StringColumnDirectReader::init(std::map* streams) {
+OLAPStatus StringColumnDirectReader::init(
+        std::map* streams,
+        int size, MemPool* mem_pool) {
     if (NULL == streams) {
         OLAP_LOG_WARNING("input streams is NULL");
         return OLAP_ERR_INPUT_PARAMETER_ERROR;
@@ -103,6 +105,8 @@ OLAPStatus StringColumnDirectReader::init(std::map(mem_pool->allocate(size * sizeof(StringSlice)));
+
     ReadOnlyFileStream* length_stream = extract_stream(_column_unique_id,
                                         StreamInfoMessage::LENGTH,
                                         streams);
@@ -173,12 +177,94 @@ OLAPStatus StringColumnDirectReader::next(char* buffer, uint32_t* length) {
     return res;
 }
 
+OLAPStatus StringColumnDirectReader::next_vector(
+            ColumnVector* column_vector,
+            uint32_t size,
+            MemPool* mem_pool,
+            int64_t* read_bytes) {
+    /*
+     * MemPool here is not the same as MemPool in init function
+     * 1. MemPool is created by VectorizedRowBatch,
+     *    and reset when load row batch
+     * 2. MemPool in init function is created by SegmentReader,
+     *    and freed by the SegmentReader destructor.
+     */
+    OLAPStatus res = OLAP_SUCCESS;
+    int64_t length = 0;
+    int64_t string_buffer_size = 0;
+
+    column_vector->set_col_data(_values);
+    if (column_vector->no_nulls()) {
+        for (int i = 0; i < size; ++i) {
+            res = _length_reader->next(&length);
+            if (OLAP_SUCCESS != res) {
+                return res;
+            }
+            _values[i].size = length;
+            string_buffer_size += length;
+        }
+
+        char* string_buffer = reinterpret_cast(mem_pool->allocate(string_buffer_size));
+        for (int i = 0; i < size; ++i) {
+            _values[i].data = string_buffer;
+            length = _values[i].size;
+            while (length > 0) {
+                uint64_t buf_size = length;
+                res = _data_stream->read(string_buffer, &buf_size);
+                if (res != OLAP_SUCCESS) {
+                    return res;
+                }
+                length -= buf_size;
+                string_buffer += buf_size;
+            }
+        }
+    } else {
+        bool* is_null = column_vector->is_null();
+        for (int i = 0; i < size; ++i) {
+            if (!is_null[i]) {
+                res = _length_reader->next(&length);
+                if (OLAP_SUCCESS != res) {
+                    return res;
+                }
+                _values[i].size = length;
+                string_buffer_size += length;
+            } else {
+                _values[i].size = 0;
+            }
+        }
+
+        char* string_buffer = reinterpret_cast(mem_pool->allocate(string_buffer_size));
+        for (int i = 0; i < size; ++i) {
+            if (!is_null[i]) {
+                length = _values[i].size;
+                _values[i].data = string_buffer;
+                while (length > 0) {
+                    uint64_t buf_size = length;
+                    res = _data_stream->read(string_buffer, &buf_size);
+                    if (res != OLAP_SUCCESS) {
+                        return res;
+                    }
+                    length -= buf_size;
+                    string_buffer += buf_size;
+                }
+            } else {
+                _values[i].data = nullptr;
+                _values[i].size = 0;
+            }
+        }
+    }
+    *read_bytes += string_buffer_size;
+
+    return res;
+}
+
 StringColumnDictionaryReader::StringColumnDictionaryReader(
         uint32_t column_unique_id,
-        uint32_t dictionary_size) : 
+        uint32_t dictionary_size) :
         _eof(false),
         _dictionary_size(dictionary_size),
         _column_unique_id(column_unique_id),
+        _values(NULL),
         //_dictionary_size(0),
         //_offset_dictionary(NULL),
         //_dictionary_data_buffer(NULL),
@@ -200,7 +286,8 @@ StringColumnDictionaryReader::~StringColumnDictionaryReader() {
 // åŽè¾¹å†æµ‹ï¼Œå…ˆä¿ç•™ä»£ç 
 
 OLAPStatus StringColumnDictionaryReader::init(std::map *streams,
-                                          UniqueIdEncodingMap* encodings) {
+                                          UniqueIdEncodingMap* encodings,
+                                          RuntimeProfile* profile) {
     ReadOnlyFileStream* dictionary_data_stream = extract_stream(_column_unique_id,
                                                       StreamInfoMessage::DICTIONARY_DATA,
                                                       streams);
@@ -284,7 +371,9 @@ OLAPStatus StringColumnDictionaryReader::init(std::map* streams) {
+OLAPStatus StringColumnDictionaryReader::init(
+        std::map* streams,
+        int size, MemPool* mem_pool) {
     ReadOnlyFileStream* dictionary_data_stream = extract_stream(_column_unique_id,
             StreamInfoMessage::DICTIONARY_DATA,
             streams);
@@ -345,6 +434,8 @@ OLAPStatus StringColumnDictionaryReader::init(std::map(mem_pool->allocate(size * sizeof(StringSlice)));
     int64_t read_buffer_size = 1024;
     char* _read_buffer = new(std::nothrow) char[read_buffer_size];
 
@@ -442,8 +533,79 @@ OLAPStatus StringColumnDictionaryReader::next(char* buffer, uint32_t* length) {
     return OLAP_SUCCESS;
 }
 
+OLAPStatus StringColumnDictionaryReader::next_vector(
+            ColumnVector* column_vector,
+            uint32_t size,
+            MemPool* mem_pool,
+            int64_t* read_bytes) {
+
+    int64_t index[size];
+    int64_t buffer_size = 0;
+    OLAPStatus res = OLAP_SUCCESS;
+
+    column_vector->set_col_data(_values);
+    if (column_vector->no_nulls()) {
+        for (int i = 0; i < size; ++i) {
+            res = _data_reader->next(&index[i]);
+            if (OLAP_SUCCESS != res) {
+                return res;
+            }
+            if (index[i] >= static_cast(_dictionary.size())) {
+                OLAP_LOG_WARNING("value may indicated an invalid dictionary entry. "
+                                 "[index = %lu, dictionary_size = %lu]",
+                                 index[i], _dictionary.size());
+                return OLAP_ERR_BUFFER_OVERFLOW;
+            }
+            _values[i].size = _dictionary[index[i]].size();
+            buffer_size += _values[i].size;
+        }
+
+        char* string_buffer = reinterpret_cast(mem_pool->allocate(buffer_size));
+        for (int i = 0; i < size; ++i) {
+            memory_copy(string_buffer,
+                        _dictionary[index[i]].c_str(),
+                        _values[i].size);
+            _values[i].data = string_buffer;
+            string_buffer += _values[i].size;
+        }
+    } else {
+        bool* is_null = column_vector->is_null();
+        for (int i = 0; i < size; ++i) {
+            if (!is_null[i]) {
+                res = _data_reader->next(&index[i]);
+                if (OLAP_SUCCESS != res) {
+                    return res;
+                }
+                if (index[i] >= static_cast(_dictionary.size())) {
+                    OLAP_LOG_WARNING("value may indicated an invalid dictionary entry. "
+                                     "[index = %lu, dictionary_size = %lu]",
+                                     index[i], _dictionary.size());
+                    return OLAP_ERR_BUFFER_OVERFLOW;
+                }
+                _values[i].size = _dictionary[index[i]].size();
+                buffer_size += _values[i].size;
+            }
+        }
+
+        char* string_buffer = reinterpret_cast(mem_pool->allocate(buffer_size));
+        for (int i = 0; i < size; ++i) {
+            if (!is_null[i]) {
+                memory_copy(string_buffer,
+                            _dictionary[index[i]].c_str(),
+                            _values[i].size);
+                _values[i].data = string_buffer;
+                string_buffer += _values[i].size;
+            }
+        }
+    }
+    *read_bytes += buffer_size;
+
+    return res;
+}
+
 ColumnReader::ColumnReader(uint32_t column_id, uint32_t column_unique_id) : 
         _value_present(false),
+        _is_null(NULL),
         _column_id(column_id),
         _column_unique_id(column_unique_id),
         _present_reader(NULL) {
@@ -475,7 +637,7 @@ ColumnReader* ColumnReader::create(uint32_t column_id,
                 return new(std::nothrow) NullValueReader(column_id, column_unique_id);
             } else {
                 return new(std::nothrow) DefaultValueReader(column_id, column_unique_id,
-                        field_info.default_value);
+                        field_info.default_value, field_info.type, field_info.length);
             }
         } else if (field_info.is_allow_null) {
             OLAP_LOG_DEBUG("create NullValueReader: %s", field_info.name.c_str());
@@ -643,17 +805,23 @@ ColumnReader::~ColumnReader() {
     SAFE_DELETE(_present_reader);
 }
 
-OLAPStatus ColumnReader::init(std::map* streams) {
+OLAPStatus ColumnReader::init(
+        std::map* streams,
+        int size, MemPool* mem_pool,
+        OlapReaderStatistics* stats) {
     if (NULL == streams) {
         OLAP_LOG_WARNING("null parameters given.");
         return OLAP_ERR_INPUT_PARAMETER_ERROR;
     }
+    _stats = stats;
 
     // 从map中找到需è¦çš„æµï¼ŒColumnReader的数æ®åº”该由一æ¡PRESENTæµå’Œä¸€æ¡ROW_INDEXæµç»„æˆ
     ReadOnlyFileStream* present_stream = extract_stream(_column_unique_id,
                                          StreamInfoMessage::PRESENT,
                                          streams);
 
+    _is_null = reinterpret_cast(mem_pool->allocate(size));
+    memset(_is_null, 0, size);
 
     if (NULL == present_stream) {
         _present_reader = NULL;
@@ -690,17 +858,25 @@ OLAPStatus ColumnReader::skip(uint64_t row_count) {
     return OLAP_SUCCESS;
 }
 
-OLAPStatus ColumnReader::next() {
+OLAPStatus ColumnReader::next_vector(
+            ColumnVector* column_vector,
+            uint32_t size,
+            MemPool* mem_pool) {
     OLAPStatus res = OLAP_SUCCESS;
-
+    column_vector->set_is_null(_is_null);
     if (NULL != _present_reader) {
-        char value = '\0';
-        res = _present_reader->next(&value);
-
-        if (OLAP_SUCCESS == res) {
-            _value_present = (1 == value);
+        column_vector->set_no_nulls(false);
+        for (uint32_t i = 0; i < size; ++i) {
+            bool value = false;
+            res = _present_reader->next((char*)&value);
+            if (OLAP_SUCCESS != res) {
+                break;
+            }
+            _is_null[i] = value;
         }
-        OLAP_LOG_DEBUG("column_id: %d, _value_present: %d", _column_unique_id, _value_present);
+        _stats->bytes_read += size;
+    } else {
+        column_vector->set_no_nulls(true);
     }
 
     return res;
@@ -730,20 +906,23 @@ uint64_t ColumnReader::_count_none_nulls(uint64_t rows) {
 TinyColumnReader::TinyColumnReader(uint32_t column_id, uint32_t column_unique_id) : 
         ColumnReader(column_id, column_unique_id),
         _eof(false),
-        _data_reader(NULL) {
-}
+        _values(NULL),
+        _data_reader(NULL) {}
 
 TinyColumnReader::~TinyColumnReader() {
     SAFE_DELETE(_data_reader);
 }
 
-OLAPStatus TinyColumnReader::init(std::map* streams) {
+OLAPStatus TinyColumnReader::init(
+        std::map* streams,
+        int size, MemPool* mem_pool,
+        OlapReaderStatistics* stats) {
     if (NULL == streams) {
         OLAP_LOG_WARNING("input streams is NULL");
         return OLAP_ERR_INPUT_PARAMETER_ERROR;
     }
 
-    ColumnReader::init(streams);
+    ColumnReader::init(streams, size, mem_pool, stats);
     ReadOnlyFileStream* data_stream = extract_stream(_column_unique_id,
                                       StreamInfoMessage::DATA,
                                       streams);
@@ -753,6 +932,7 @@ OLAPStatus TinyColumnReader::init(std::map* str
         return OLAP_ERR_COLUMN_STREAM_NOT_EXIST;
     }
 
+    _values = reinterpret_cast(mem_pool->allocate(size));
     _data_reader = new(std::nothrow) RunLengthByteReader(data_stream);
 
     if (NULL == _data_reader) {
@@ -790,24 +970,38 @@ OLAPStatus TinyColumnReader::skip(uint64_t row_count) {
     return _data_reader->skip(_count_none_nulls(row_count));
 }
 
-OLAPStatus TinyColumnReader::next() {
-#ifndef PERFORMANCE
-
-    if (NULL == _data_reader) {
-        OLAP_LOG_WARNING("reader not init.");
-        return OLAP_ERR_NOT_INITED;
+OLAPStatus TinyColumnReader::next_vector(
+        ColumnVector* column_vector,
+        uint32_t size,
+        MemPool* mem_pool) {
+    OLAPStatus res = ColumnReader::next_vector(column_vector, size, mem_pool);
+    if (OLAP_SUCCESS != res) {
+        if (OLAP_ERR_DATA_EOF == res) {
+            _eof = true;
+        }
+        return res;
     }
 
-#endif
-    OLAPStatus res = ColumnReader::next();
-
-    if (OLAP_SUCCESS == res) {
-        if (false == _value_present) {
-            res = _data_reader->next(&_value);
-        } else {
-            _value = 0;
+    bool* is_null = column_vector->is_null();
+    column_vector->set_col_data(_values);
+    if (column_vector->no_nulls()) {
+        for (uint32_t i = 0; i < size; ++i) {
+            res = _data_reader->next(_values + i);
+            if (OLAP_SUCCESS != res) {
+                break;
+            }
+        }
+    } else {
+        for (uint32_t i = 0; i < size; ++i) {
+            if (!is_null[i]) {
+                res = _data_reader->next(_values + i);
+                if (OLAP_SUCCESS != res) {
+                    break;
+                }
+            }
         }
     }
+    _stats->bytes_read += size;
 
     if (OLAP_ERR_DATA_EOF == res) {
         _eof = true;
@@ -818,9 +1012,10 @@ OLAPStatus TinyColumnReader::next() {
 
 DecimalColumnReader::DecimalColumnReader(uint32_t column_id, uint32_t column_unique_id) : 
         ColumnReader(column_id, column_unique_id),
+        _eof(false),
+        _values(NULL),
         _int_reader(NULL),
         _frac_reader(NULL) {
-        _value = {0, 0};
 }
 
 DecimalColumnReader::~DecimalColumnReader() {
@@ -828,14 +1023,20 @@ DecimalColumnReader::~DecimalColumnReader() {
     SAFE_DELETE(_frac_reader);
 }
 
-OLAPStatus DecimalColumnReader::init(std::map* streams) {
+OLAPStatus DecimalColumnReader::init(
+        std::map* streams,
+        int size, MemPool* mem_pool,
+        OlapReaderStatistics* stats) {
     if (NULL == streams) {
         OLAP_LOG_WARNING("input streams is NULL");
         return OLAP_ERR_INPUT_PARAMETER_ERROR;
     }
 
     // reset stream and reader
-    ColumnReader::init(streams);
+    ColumnReader::init(streams, size, mem_pool, stats);
+
+    _values = reinterpret_cast(mem_pool->allocate(size * sizeof(decimal12_t)));
+
     // 从map中找到需è¦çš„æµï¼ŒStringColumnReader的数æ®åº”该由一æ¡DATAæµå’Œä¸€æ¡LENGTHæµç»„æˆ
     ReadOnlyFileStream* int_stream = extract_stream(_column_unique_id,
                                      StreamInfoMessage::DATA,
@@ -874,16 +1075,6 @@ OLAPStatus DecimalColumnReader::init(std::map*
     return OLAP_SUCCESS;
 }
 
-OLAPStatus DecimalColumnReader::attach(RowCursor* cursor) {
-    OLAPStatus res;
-    if (true == _value_present) {
-        res = cursor->set_null(_column_id);
-    } else {
-        res = cursor->attach_by_index(_column_id, reinterpret_cast(&_value), false);
-    }
-    return res;
-}
-
 OLAPStatus DecimalColumnReader::seek(PositionProvider* positions) {
     OLAPStatus res;
     if (NULL == _present_reader) {
@@ -937,76 +1128,90 @@ OLAPStatus DecimalColumnReader::skip(uint64_t row_count) {
     return OLAP_SUCCESS;
 }
 
-OLAPStatus DecimalColumnReader::next() {
-    OLAPStatus res = ColumnReader::next();
-    if (OLAP_SUCCESS == res) {
-        if (false == _value_present) {
-            int64_t value;
-            OLAPStatus res = _int_reader->next(&value);
+OLAPStatus DecimalColumnReader::next_vector(
+        ColumnVector* column_vector,
+        uint32_t size,
+        MemPool* mem_pool) {
+    OLAPStatus res = ColumnReader::next_vector(column_vector, size, mem_pool);
+    if (OLAP_SUCCESS != res) {
+        if (OLAP_ERR_DATA_EOF == res) {
+            _eof = true;
+        }
+        return res;
+    }
 
+    bool* is_null = column_vector->is_null();
+    column_vector->set_col_data(_values);
+
+    if (column_vector->no_nulls()) {
+        for (uint32_t i = 0; i < size; ++i) {
+            int64_t value = 0;
+            OLAPStatus res = _int_reader->next(&value);
             if (OLAP_SUCCESS != res) {
                 OLAP_LOG_WARNING("fail to read decimal int part");
-                return res;
+                break;
             }
+            _values[i].integer = value;
 
-            _value._int = value;
             res = _frac_reader->next(&value);
-
             if (OLAP_SUCCESS != res) {
                 OLAP_LOG_WARNING("fail to read decimal frac part");
-                return res;
+                break;
+            }
+            _values[i].fraction = value;
+        }
+    } else {
+        for (uint32_t i = 0; i < size; ++i) {
+            int64_t value = 0;
+            if (!is_null[i]) {
+                OLAPStatus res = _int_reader->next(&value);
+                if (OLAP_SUCCESS != res) {
+                    OLAP_LOG_WARNING("fail to read decimal int part");
+                    break;
+                }
+                _values[i].integer = value;
+
+                res = _frac_reader->next(&value);
+                if (OLAP_SUCCESS != res) {
+                    OLAP_LOG_WARNING("fail to read decimal frac part");
+                    break;
+                }
+                _values[i].fraction = value;
             }
-	    _value._frac = value;
-        } else {
-            _value._int = 0;
-            _value._frac = 0;
         }
     }
+    _stats->bytes_read += sizeof(decimal12_t) * size;
 
     return res;
-    
-
-    /*
-    int64_t value;
-    OLAPStatus res = _int_reader->next(&value);
-
-    if (OLAP_SUCCESS != res) {
-        OLAP_LOG_WARNING("fail to read decimal int part");
-        return res;
-    }
-
-    _value._int = value;
-    res = _frac_reader->next(&value);
-
-    if (OLAP_SUCCESS != res) {
-        OLAP_LOG_WARNING("fail to read decimal frac part");
-        return res;
-    }
-
-    _value._frac = value;
-    return res;
-    */
 }
 
 LargeIntColumnReader::LargeIntColumnReader(uint32_t column_id, uint32_t column_unique_id) : 
         ColumnReader(column_id, column_unique_id),
+        _eof(false),
+        _values(NULL),
         _high_reader(NULL),
-        _low_reader(NULL) {
-}
+        _low_reader(NULL) {}
 
 LargeIntColumnReader::~LargeIntColumnReader() {
     SAFE_DELETE(_high_reader);
     SAFE_DELETE(_low_reader);
 }
 
-OLAPStatus LargeIntColumnReader::init(std::map* streams) {
+OLAPStatus LargeIntColumnReader::init(
+        std::map* streams,
+        int size, MemPool* mem_pool,
+        OlapReaderStatistics* stats) {
     if (NULL == streams) {
         OLAP_LOG_WARNING("input streams is NULL");
         return OLAP_ERR_INPUT_PARAMETER_ERROR;
     }
 
     // reset stream and reader
-    ColumnReader::init(streams);
+    ColumnReader::init(streams, size, mem_pool, stats);
+
+    _values = reinterpret_cast(
+        mem_pool->try_allocate_aligned(size * sizeof(int128_t), alignof(int128_t)));
+
     // 从map中找到需è¦çš„æµï¼ŒLargeIntColumnReader的数æ®åº”该由一æ¡DATAæµç»„æˆ
     ReadOnlyFileStream* high_stream = extract_stream(_column_unique_id,
                                      StreamInfoMessage::DATA,
@@ -1041,16 +1246,6 @@ OLAPStatus LargeIntColumnReader::init(std::map*
     return OLAP_SUCCESS;
 }
 
-OLAPStatus LargeIntColumnReader::attach(RowCursor* cursor) {
-    OLAPStatus res;
-    if (true == _value_present) {
-        res = cursor->set_null(_column_id);
-    } else {
-        res = cursor->attach_by_index(_column_id, reinterpret_cast(&_value), false);
-    }
-    return res;
-}
-
 OLAPStatus LargeIntColumnReader::seek(PositionProvider* positions) {
     OLAPStatus res;
     if (NULL == _present_reader) {
@@ -1103,46 +1298,59 @@ OLAPStatus LargeIntColumnReader::skip(uint64_t row_count) {
     return OLAP_SUCCESS;
 }
 
-OLAPStatus LargeIntColumnReader::next() {
-    OLAPStatus res = ColumnReader::next();
-    if (OLAP_SUCCESS == res) {
-        if (false == _value_present) {
-            int64_t* value = (int64_t*)(&_value);
+OLAPStatus LargeIntColumnReader::next_vector(
+        ColumnVector* column_vector,
+        uint32_t size,
+        MemPool* mem_pool) {
+    OLAPStatus res = ColumnReader::next_vector(column_vector, size, mem_pool);
+    if (OLAP_SUCCESS != res) {
+        if (OLAP_ERR_DATA_EOF == res) {
+            _eof = true;
+        }
+        return res;
+    }
 
+    bool* is_null = column_vector->is_null();
+    column_vector->set_col_data(_values);
+
+    if (column_vector->no_nulls()) {
+        for (uint32_t i = 0; i < size; ++i) {
+            int64_t* value = NULL;
+            value = (int64_t*)(_values + i);
             res = _high_reader->next(value);
             if (OLAP_SUCCESS != res) {
-                OLAP_LOG_WARNING("fail to read large int high part. [res=%d]", res);
-                return res;
+                OLAP_LOG_WARNING("fail to read decimal int part");
+                break;
             }
 
             res = _low_reader->next(++value);
             if (OLAP_SUCCESS != res) {
-                OLAP_LOG_WARNING("fail to read large int low part. [res=%d]", res);
-                return res;
+                OLAP_LOG_WARNING("fail to read decimal frac part");
+                break;
             }
+        }
+    } else {
+        for (uint32_t i = 0; i < size; ++i) {
+            int64_t* value = NULL;
+            if (!is_null[i]) {
+                value = (int64_t*)(_values + i);
+                res = _high_reader->next(value);
+                if (OLAP_SUCCESS != res) {
+                    OLAP_LOG_WARNING("fail to read decimal int part");
+                    break;
+                }
 
-        } else {
-            _value = 0;
+                res = _low_reader->next(++value);
+                if (OLAP_SUCCESS != res) {
+                    OLAP_LOG_WARNING("fail to read decimal frac part");
+                    break;
+                }
+            }
         }
     }
+    _stats->bytes_read += 16 * size;
+
     return res;
-
-
-    /*
-    int64_t* value = (int64_t*)(&_value);
-
-    OLAPStatus res = _high_reader->next(value);
-    if (OLAP_SUCCESS != res) {
-        OLAP_LOG_WARNING("fail to read large int high part. [res=%d]", res);
-        return res;
-    }
-
-    res = _low_reader->next(++value);
-    if (OLAP_SUCCESS != res) {
-        OLAP_LOG_WARNING("fail to read large int low part. [res=%d]", res);
-        return res;
-    }
-    */
 }
 
 }  // namespace column_file
diff --git a/be/src/olap/column_file/column_reader.h b/be/src/olap/column_file/column_reader.h
index 17144ea2e2..4f94252417 100644
--- a/be/src/olap/column_file/column_reader.h
+++ b/be/src/olap/column_file/column_reader.h
@@ -25,6 +25,8 @@
 #include "olap/olap_common.h"
 #include "olap/olap_define.h"
 #include "olap/row_cursor.h"
+#include "runtime/vectorized_row_batch.h"
+#include "util/date_func.h"
 
 namespace palo {
 namespace column_file {
@@ -77,7 +79,8 @@ public:
      * @param  is_sign 所读å–çš„æ•°æ˜¯å¦æœ‰ç¬¦å·
      * @return         [description]
      */
-    OLAPStatus init(std::map* streams, bool is_sign);
+    OLAPStatus init(std::map* streams,
+                    bool is_sign);
     // 将内部指针定ä½åˆ°positions
     OLAPStatus seek(PositionProvider* positions);
     // 将内部指针å‘åŽç§»åЍrow_count行
@@ -100,13 +103,18 @@ public:
     StringColumnDirectReader(uint32_t column_unique_id, uint32_t dictionary_size);
     ~StringColumnDirectReader();
 
-    OLAPStatus init(std::map* streams);
+    OLAPStatus init(std::map* streams,
+                    int size, MemPool* mem_pool);
     OLAPStatus seek(PositionProvider* positions);
     OLAPStatus skip(uint64_t row_count);
     // 返回当å‰è¡Œçš„æ•°æ®ï¼Œå¹¶å°†å†…部指针å‘åŽç§»åЍ
     // buffer - 返回数æ®çš„缓冲区
     // length - 输入时作为缓存区大å°ï¼Œè¿”回时给出字符串的大å°
     OLAPStatus next(char* buffer, uint32_t* length);
+    OLAPStatus next_vector(ColumnVector* column_vector,
+                           uint32_t size,
+                           MemPool* mem_pool,
+                           int64_t* read_bytes);
 
     size_t get_buffer_size() {
         return sizeof(RunLengthByteReader);
@@ -115,6 +123,7 @@ public:
 private:
     bool _eof;
     uint32_t _column_unique_id;
+    StringSlice* _values;
     ReadOnlyFileStream* _data_stream;
     RunLengthIntegerReader* _length_reader;
 };
@@ -131,10 +140,15 @@ class StringColumnDictionaryReader {
 public:
     StringColumnDictionaryReader(uint32_t column_unique_id, uint32_t dictionary_size);
     ~StringColumnDictionaryReader();
-    OLAPStatus init(std::map* streams);
+    OLAPStatus init(std::map* streams,
+                    int size, MemPool* mem_pool);
     OLAPStatus seek(PositionProvider* positions);
     OLAPStatus skip(uint64_t row_count);
     OLAPStatus next(char* buffer, uint32_t* length);
+    OLAPStatus next_vector(ColumnVector* column_vector,
+                           uint32_t size,
+                           MemPool* mem_pool,
+                           int64_t* read_bytes);
 
     size_t get_buffer_size() {
         return sizeof(RunLengthByteReader) + _dictionary_size;
@@ -144,6 +158,7 @@ private:
     bool _eof;
     uint32_t _dictionary_size;
     uint32_t _column_unique_id;
+    StringSlice* _values;
     char* _read_buffer;
     //uint64_t _dictionary_size;
     //uint64_t* _offset_dictionary;   // ç”¨æ¥æŸ¥æ‰¾å“应数æ®çš„æ•°å­—对应的offset
@@ -180,13 +195,9 @@ public:
     // ColumnReaderä»…åˆå§‹åŒ–ä¸€æ¬¡ï¼Œæ¯æ¬¡ä½¿ç”¨æ—¶åˆ†é…新的对象。
     // Input:
     //     streams - 输入stream
-    virtual OLAPStatus init(std::map* streams);
-
-    // 将内部数æ®attach到cursor, ColumnReader必须将需è¦è¿”回的数æ®ç¼“存在内部
-    // çš„data_buffer, å†é€šè¿‡cursorçš„attach_by_index返回该数æ®
-    // 峿‰§è¡Œcursor->attach_by_index(_column_id, data_buffer);
-    // Readeræ— åºattach,因为readeråªç”¨æ¥åˆ¤æ–­å½“å‰valueæ˜¯ä¸æ˜¯ä¸ºç©º
-    virtual OLAPStatus attach(RowCursor* cursor) = 0;
+    virtual OLAPStatus init(std::map* streams,
+                            int size, MemPool* mem_pool,
+                            OlapReaderStatistics* stats);
 
     // 设置下一个返回的数æ®çš„ä½ç½®
     // positions是å„个列需è¦seekçš„ä½ç½®, ColumnReader通过(*positions)[_column_unique_id]
@@ -197,20 +208,9 @@ public:
     // 如果上层skip过而底层ä¸skip,next判断空ä¸ç©ºä¸æ˜¯ä¸å‡†äº†å—
     virtual OLAPStatus skip(uint64_t row_count);
 
-    // next会将å˜é‡è¯»å–至_value_present
-    // æ­¤å˜é‡å…¶å®žæœ¬èº«ä¸ä¼šè¢«å¤–部使用,而是é…åˆå…¶ä»–列类型,
-    // æ¥æä¾› 空/éžç©º 指示
-    virtual OLAPStatus next();
-
-    // get vector for row batch
-    virtual OLAPStatus next_vector(
-            uint8_t* batch_buf,
-            uint32_t batch_buf_len,
-            uint32_t start_row_in_block,
-            uint32_t batch_size,
-            std::vector& offset) {
-        return OLAP_SUCCESS;
-    }
+    virtual OLAPStatus next_vector(ColumnVector* column_vector,
+                                   uint32_t size,
+                                   MemPool* mem_pool);
 
     uint32_t column_unique_id() {
         return _column_unique_id;
@@ -231,23 +231,156 @@ protected:
     uint64_t _count_none_nulls(uint64_t rows);
 
     bool _value_present;
+    bool* _is_null;
     uint32_t _column_id;        // column在schema内的id
     uint32_t _column_unique_id; // column的唯一id
     BitFieldReader* _present_reader;   // NULLable的字段的NULL值
     std::vector _sub_readers;
+    OlapReaderStatistics* _stats = nullptr;
 };
 
 class DefaultValueReader : public ColumnReader {
 public:
-    DefaultValueReader(uint32_t column_id, uint32_t column_unique_id, std::string default_value) :
-        ColumnReader(column_id, column_unique_id),
-        _default_value(default_value) {
-    }
-    virtual OLAPStatus init(std::map* streams) {
-        return OLAP_SUCCESS;
-    }
-    virtual OLAPStatus attach(RowCursor* cursor) {
-        cursor->get_mutable_field_by_index(_column_id)->from_string(_default_value.c_str());
+    DefaultValueReader(uint32_t column_id, uint32_t column_unique_id,
+                       std::string default_value, FieldType type, int length)
+        : ColumnReader(column_id, column_unique_id),
+          _default_value(default_value), _values(NULL),
+          _type(type), _length(length) {}
+    
+    virtual ~DefaultValueReader() {}
+
+    virtual OLAPStatus init(std::map* streams,
+                            int size, MemPool* mem_pool,
+                            OlapReaderStatistics* stats) {
+        switch (_type) {
+            case OLAP_FIELD_TYPE_TINYINT: {
+                _values = reinterpret_cast(mem_pool->allocate(size * sizeof(int8_t)));
+                int32_t value = 0;
+                std::stringstream ss(_default_value);
+                ss >> value;
+                for (int i = 0; i < size; ++i) {
+                    ((int8_t*)_values)[i] = value;
+                }
+                break;
+            }
+            case OLAP_FIELD_TYPE_SMALLINT: {
+                _values = reinterpret_cast(mem_pool->allocate(size * sizeof(int16_t)));
+                int16_t value = 0;
+                std::stringstream ss(_default_value);
+                ss >> value;
+                for (int i = 0; i < size; ++i) {
+                    ((int16_t*)_values)[i] = value;
+                }
+                break;
+            }
+            case OLAP_FIELD_TYPE_INT: {
+                _values = reinterpret_cast(mem_pool->allocate(size * sizeof(int32_t)));
+                int32_t value = 0;
+                std::stringstream ss(_default_value);
+                ss >> value;
+                for (int i = 0; i < size; ++i) {
+                    ((int32_t*)_values)[i] = value;
+                }
+                break;
+            }
+            case OLAP_FIELD_TYPE_BIGINT: {
+                _values = reinterpret_cast(mem_pool->allocate(size * sizeof(int64_t)));
+                int64_t value = 0;
+                std::stringstream ss(_default_value);
+                ss >> value;
+                for (int i = 0; i < size; ++i) {
+                    ((int64_t*)_values)[i] = value;
+                }
+                break;
+            }
+            case OLAP_FIELD_TYPE_LARGEINT: {
+                _values = reinterpret_cast(
+                    mem_pool->try_allocate_aligned(size * sizeof(int128_t), alignof(int128_t)));
+                int128_t value = 0;
+                std::stringstream ss(_default_value);
+                ss >> value;
+                for (int i = 0; i < size; ++i) {
+                    ((int128_t*)_values)[i] = value;
+                }
+                break;
+            }
+            case OLAP_FIELD_TYPE_FLOAT: {
+                _values = reinterpret_cast(mem_pool->allocate(size * sizeof(float)));
+                float value = 0;
+                std::stringstream ss(_default_value);
+                ss >> value;
+                for (int i = 0; i < size; ++i) {
+                    ((float*)_values)[i] = value;
+                }
+                break;
+            }
+            case OLAP_FIELD_TYPE_DOUBLE: {
+                _values = reinterpret_cast(mem_pool->allocate(size * sizeof(double)));
+                double value = 0;
+                std::stringstream ss(_default_value);
+                ss >> value;
+                for (int i = 0; i < size; ++i) {
+                    ((double*)_values)[i] = value;
+                }
+                break;
+            }
+            case OLAP_FIELD_TYPE_DECIMAL: {
+                _values = reinterpret_cast(mem_pool->allocate(size * sizeof(decimal12_t)));
+                decimal12_t value(0, 0);
+                value.from_string(_default_value);
+                for (int i = 0; i < size; ++i) {
+                    ((decimal12_t*)_values)[i] = value;
+                }
+                break;
+            }
+            case OLAP_FIELD_TYPE_CHAR: {
+                _values =
+                    reinterpret_cast(mem_pool->allocate(size * sizeof(StringSlice)));
+                int32_t length = _length;
+                char* string_buffer = reinterpret_cast(mem_pool->allocate(size * length));
+                memset(string_buffer, 0, size * length);
+                for (int i = 0; i < size; ++i) {
+                    memory_copy(string_buffer, _default_value.c_str(), _default_value.length());
+                    ((StringSlice*)_values)[i].size = length;
+                    ((StringSlice*)_values)[i].data = string_buffer;
+                    string_buffer += length;
+                }
+                break;
+            }
+            case OLAP_FIELD_TYPE_VARCHAR: {
+                _values =
+                    reinterpret_cast(mem_pool->allocate(size * sizeof(StringSlice)));
+                int32_t length = _default_value.length();
+                char* string_buffer = reinterpret_cast(mem_pool->allocate(size * length));
+                for (int i = 0; i < size; ++i) {
+                    memory_copy(string_buffer, _default_value.c_str(), length);
+                    ((StringSlice*)_values)[i].size = length;
+                    ((StringSlice*)_values)[i].data = string_buffer;
+                    string_buffer += length;
+                }
+                break;
+            }
+            case OLAP_FIELD_TYPE_DATE: {
+                _values =
+                    reinterpret_cast(mem_pool->allocate(size * sizeof(uint24_t)));
+                uint24_t value = timestamp_from_date(_default_value);
+                for (int i = 0; i < size; ++i) {
+                    ((uint24_t*)_values)[i] = value;
+                }
+                break;
+            }
+            case OLAP_FIELD_TYPE_DATETIME: {
+                _values =
+                    reinterpret_cast(mem_pool->allocate(size * sizeof(uint64_t)));
+                uint64_t value = timestamp_from_datetime(_default_value);
+                for (int i = 0; i < size; ++i) {
+                    ((uint64_t*)_values)[i] = value;
+                }
+                break;
+            }
+            default: break;
+        }
+        _stats = stats;
         return OLAP_SUCCESS;
     }
     virtual OLAPStatus seek(PositionProvider* positions) {
@@ -256,11 +389,21 @@ public:
     virtual OLAPStatus skip(uint64_t row_count) {
         return OLAP_SUCCESS;
     }
-    virtual OLAPStatus next() {
+
+    virtual OLAPStatus next_vector(
+            ColumnVector* column_vector,
+            uint32_t size,
+            MemPool* mem_pool) {
+        column_vector->set_no_nulls(true);
+        column_vector->set_col_data(_values);
+        _stats->bytes_read += _length * size;
         return OLAP_SUCCESS;
     }
 private:
     std::string _default_value;
+    void* _values;
+    FieldType _type;
+    int32_t _length;
 };
 
 class NullValueReader : public ColumnReader {
@@ -268,21 +411,27 @@ public:
     NullValueReader(uint32_t column_id, uint32_t column_unique_id) :
         ColumnReader(column_id, column_unique_id) {
     }
-    virtual OLAPStatus init(std::map* streams) {
+    OLAPStatus init(std::map* streams,
+                            int size, MemPool* mem_pool,
+                            OlapReaderStatistics* stats) override {
+        _is_null = reinterpret_cast(mem_pool->allocate(size));
+        memset(_is_null, 1, size);
+        _stats = stats;
         return OLAP_SUCCESS;
     }
-    virtual OLAPStatus attach(RowCursor* cursor) {
-        OLAPStatus res = cursor->set_null(_column_id);  
-        return res;
-    }
     virtual OLAPStatus seek(PositionProvider* positions) {
         return OLAP_SUCCESS;
     }
     virtual OLAPStatus skip(uint64_t row_count) {
         return OLAP_SUCCESS;
     }
-    virtual OLAPStatus next() {
-        _value_present = true;
+    virtual OLAPStatus next_vector(
+            ColumnVector* column_vector,
+            uint32_t size,
+            MemPool* mem_pool) {
+        column_vector->set_no_nulls(false);
+        column_vector->set_is_null(_is_null);
+        _stats->bytes_read += size;
         return OLAP_SUCCESS;
     }
 };
@@ -293,48 +442,14 @@ public:
     TinyColumnReader(uint32_t column_id, uint32_t column_unique_id);
     virtual ~TinyColumnReader();
 
-    virtual OLAPStatus init(std::map* streams);
-    virtual OLAPStatus attach(RowCursor* cursor) {
-        OLAPStatus res;
-        if (true == _value_present) {
-            res = cursor->set_null(_column_id);
-        } else {
-            res = cursor->attach_by_index(_column_id, &_value, false);
-        }
-        return res;
-    }
+    virtual OLAPStatus init(std::map* streams,
+                            int size, MemPool* mem_pool,
+                            OlapReaderStatistics* stats);
     virtual OLAPStatus seek(PositionProvider* positions);
     virtual OLAPStatus skip(uint64_t row_count);
-    virtual OLAPStatus next();
-
-    virtual OLAPStatus next_vector(
-            uint8_t* batch_buf,
-            uint32_t batch_buf_len,
-            uint32_t start_row_in_block,
-            uint32_t batch_size,
-            std::vector& offset) {
-        char* return_value = reinterpret_cast(batch_buf + offset.front());
-
-        for (uint32_t i = start_row_in_block; i < batch_size; i++) {
-            OLAPStatus res = ColumnReader::next();
-
-            if (OLAP_SUCCESS == res) {
-                if (false == _value_present) {
-                    _data_reader->next(return_value);
-                } else {
-                    *return_value = 0;
-                }
-
-                return_value++;
-            }
-
-            if (OLAP_ERR_DATA_EOF == res) {
-                _eof = true;
-            }
-        }
-
-        return OLAP_SUCCESS;
-    }
+    virtual OLAPStatus next_vector(ColumnVector* column_vector,
+                                   uint32_t size,
+                                   MemPool* mem_pool);
 
     virtual size_t get_buffer_size() {
         return sizeof(RunLengthByteReader);
@@ -342,7 +457,7 @@ public:
 
 private:
     bool _eof;
-    char _value;
+    char* _values;
     RunLengthByteReader* _data_reader;
 };
 
@@ -352,31 +467,25 @@ class IntegerColumnReaderWrapper : public ColumnReader {
 public:
     IntegerColumnReaderWrapper(uint32_t column_id, uint32_t column_unique_id) :
         ColumnReader(column_id, column_unique_id),
-        _reader(column_unique_id),
+        _reader(column_unique_id), _values(NULL),
         _eof(false) {
     }
 
     virtual ~IntegerColumnReaderWrapper() {}
 
-    virtual OLAPStatus init(std::map* streams) {
-        OLAPStatus res = ColumnReader::init(streams);
+    virtual OLAPStatus init(std::map* streams,
+                            int size, MemPool* mem_pool,
+                            OlapReaderStatistics* stats) {
+        OLAPStatus res = ColumnReader::init(streams, size, mem_pool, stats);
 
         if (OLAP_SUCCESS == res) {
             res = _reader.init(streams, is_sign);
         }
 
-        return res;
-    }
-    virtual OLAPStatus attach(RowCursor* cursor) {
-        OLAPStatus res;
-        if (true == _value_present) {
-            res = cursor->set_null(_column_id);
-        } else {
-            res = cursor->attach_by_index(_column_id, reinterpret_cast(&_value), false);
-        }
-        return res;
-    }
+        _values = reinterpret_cast(mem_pool->allocate(size * sizeof(T)));
 
+        return res;
+    }
     virtual OLAPStatus seek(PositionProvider* positions) {
         OLAPStatus res;
         if (NULL == _present_reader) {
@@ -402,64 +511,57 @@ public:
     virtual OLAPStatus skip(uint64_t row_count) {
         return _reader.skip(_count_none_nulls(row_count));
     }
-    virtual OLAPStatus next() {
-        OLAPStatus res = ColumnReader::next();
 
-        if (OLAP_SUCCESS == res) {
-            if (false == _value_present) {
+    virtual OLAPStatus next_vector(
+            ColumnVector* column_vector,
+            uint32_t size,
+            MemPool* mem_pool) {
+        OLAPStatus res = ColumnReader::next_vector(column_vector, size, mem_pool);
+        if (OLAP_SUCCESS != res) {
+            if (OLAP_ERR_DATA_EOF == res) {
+                _eof = true;
+            }
+            return res;
+        }
+
+        column_vector->set_col_data(_values);
+        if (column_vector->no_nulls()) {
+            for (uint32_t i = 0; i < size; ++i) {
                 int64_t value = 0;
                 res = _reader.next(&value);
-                _value = value;
-            } else {
-                _value = 0;
+                if (OLAP_SUCCESS != res) {
+                    break;
+                }
+                _values[i] = value;
+            }
+        } else {
+            bool* is_null = column_vector->is_null();
+            for (uint32_t i = 0; i < size; ++i) {
+                int64_t value = 0;
+                if (!is_null[i]) {
+                    res = _reader.next(&value);
+                    if (OLAP_SUCCESS != res) {
+                        break;
+                    }
+                }
+                _values[i] = value;
             }
         }
+        _stats->bytes_read += sizeof(T) * size;
 
         if (OLAP_ERR_DATA_EOF == res) {
             _eof = true;
         }
-
         return res;
     }
 
-    virtual OLAPStatus next_vector(
-            uint8_t* batch_buf,
-            uint32_t batch_buf_len,
-            uint32_t start_row_in_block,
-            uint32_t batch_size,
-            std::vector& offset) {
-        T* return_value = reinterpret_cast(batch_buf + offset.front());
-
-        for (uint32_t i = start_row_in_block; i < batch_size; i++) {
-            OLAPStatus res = ColumnReader::next();
-
-            if (OLAP_SUCCESS == res) {
-                if (false == _value_present) {
-                    int64_t value = 0;
-                    res = _reader.next(&value);
-                    *return_value = value;
-                } else {
-                    *return_value = 0;
-                }
-
-                return_value++;
-            }
-
-            if (OLAP_ERR_DATA_EOF == res) {
-                _eof = true;
-            }
-        }
-
-        return OLAP_SUCCESS;
-    }
-
     virtual size_t get_buffer_size() {
         return sizeof(RunLengthIntegerReader);
     }
 
 private:
     IntegerColumnReader _reader;  // 被包裹的真实读å–器
-    T _value;                     // 当å‰è¡Œè¯»å‡ºçš„值
+    T* _values;
     bool _eof;
 };
 
@@ -474,40 +576,25 @@ public:
             uint32_t string_length,
             uint32_t dictionary_size) :
             ColumnReader(column_id, column_unique_id),
-            _buf(NULL),
+            _eof(false),
             _reader(column_unique_id, dictionary_size),
             _string_length(string_length) {
     }
     virtual ~FixLengthStringColumnReader() {
-        SAFE_DELETE_ARRAY(_buf);
     }
 
-    virtual OLAPStatus init(std::map* streams) {
-        _buf = new(std::nothrow) char [_string_length];
-
-        if (NULL == _buf) {
-            return OLAP_ERR_MALLOC_ERROR;
-        }
-
-        OLAPStatus res = ColumnReader::init(streams);
+    virtual OLAPStatus init(std::map* streams,
+                            int size, MemPool* mem_pool,
+                            OlapReaderStatistics* stats) {
+        OLAPStatus res = ColumnReader::init(streams, size, mem_pool, stats);
 
         if (OLAP_SUCCESS == res) {
-            res = _reader.init(streams);
+            res = _reader.init(streams, size, mem_pool);
         }
 
         return res;
     }
 
-    virtual OLAPStatus attach(RowCursor* cursor) {
-        OLAPStatus res;
-        if (true == _value_present) {
-            res = cursor->set_null(_column_id);
-        } else {
-            res = cursor->attach_by_index(_column_id, _buf, false);
-        }
-        return res;
-    }
-
     virtual OLAPStatus seek(PositionProvider* positions) {
         OLAPStatus res;
         if (NULL == _present_reader) {
@@ -533,50 +620,20 @@ public:
     virtual OLAPStatus skip(uint64_t row_count) {
         return _reader.skip(_count_none_nulls(row_count));
     }
-    virtual OLAPStatus next() {
-        uint32_t buf_size = _string_length;
-        OLAPStatus res = ColumnReader::next();
-
-        if (OLAP_SUCCESS == res) {
-            if (false == _value_present) {
-                res = _reader.next(_buf, &buf_size);
-            } else {
-                memset(_buf, 0, buf_size);
-            }
-        }
-
-        // 将多余的buffer设置为0
-        if (OLAP_SUCCESS == res) {
-            memset(&_buf[buf_size], 0, _string_length - buf_size);
-        }
-
-        return res;
-    }
-
     virtual OLAPStatus next_vector(
-            uint8_t* batch_buf,
-            uint32_t batch_buf_len,
-            uint32_t start_row_in_block,
-            uint32_t batch_size,
-            std::vector& offset) {
-        OLAPStatus res = OLAP_SUCCESS;
-        char* return_value = reinterpret_cast(batch_buf + offset.front());
+            ColumnVector* column_vector,
+            uint32_t size,
+            MemPool* mem_pool) {
 
-        for (uint32_t i = start_row_in_block; i < batch_size; i++) {
-            res = this->next();
-
-            if (OLAP_SUCCESS == res) {
-                memcpy(return_value, _buf, _string_length);
-            } else if (OLAP_ERR_COLUMN_STREAM_EOF == res) {
-                memcpy(return_value, _buf, _string_length);
-                break;
-            } else {
-                OLAP_LOG_WARNING("fail to get next. [res=%d]", res);
-                return res;
+        OLAPStatus res = ColumnReader::next_vector(column_vector, size, mem_pool);
+        if (OLAP_SUCCESS != res) {
+            if (OLAP_ERR_DATA_EOF == res) {
+                _eof = true;
             }
+            return res;
         }
 
-        return OLAP_SUCCESS;
+        return _reader.next_vector(column_vector, size, mem_pool, &_stats->bytes_read);
     }
 
     virtual size_t get_buffer_size() {
@@ -584,7 +641,7 @@ public:
     }
 
 private:
-    char* _buf;
+    bool _eof;
     ReaderClass _reader;
     uint32_t _string_length;
 };
@@ -599,35 +656,20 @@ public:
             uint32_t max_length,
             uint32_t dictionary_size) :
             ColumnReader(column_id, column_unique_id),
-            _buf(NULL),
+            _eof(false),
             _reader(column_unique_id, dictionary_size),
-            _max_length(max_length),
-            _real_length(NULL) {
+            _max_length(max_length) {
     }
     virtual ~VarStringColumnReader() {
-        SAFE_DELETE_ARRAY(_buf);
     }
-    virtual OLAPStatus init(std::map* streams) {
-        OLAPStatus res = ColumnReader::init(streams);
+    virtual OLAPStatus init(std::map* streams,
+                            int size, MemPool* mem_pool,
+                            OlapReaderStatistics* stats) {
+        OLAPStatus res = ColumnReader::init(streams, size, mem_pool, stats);
         if (OLAP_SUCCESS == res) {
-            res = _reader.init(streams);
+            res = _reader.init(streams, size, mem_pool);
         }
 
-        _buf = new(std::nothrow) char[_max_length];
-        if (NULL == _buf) {
-            OLAP_LOG_WARNING("fail to malloc buffer. [size=%u]", _max_length);
-            res = OLAP_ERR_MALLOC_ERROR;
-        }
-
-        _real_length = reinterpret_cast(_buf);
-        return res;
-    }
-
-    virtual OLAPStatus attach(RowCursor* cursor) {
-        if (true == _value_present) {
-            cursor->set_null(_column_id);
-        }
-        OLAPStatus res = cursor->attach_by_index(_column_id, _buf, false);
         return res;
     }
 
@@ -656,54 +698,20 @@ public:
     virtual OLAPStatus skip(uint64_t row_count) {
         return _reader.skip(_count_none_nulls(row_count));
     }
-    virtual OLAPStatus next() {
-        uint32_t buf_size = 0;
-        *_real_length = 0;
 
-        OLAPStatus res = ColumnReader::next();
-        if (OLAP_LIKELY(OLAP_SUCCESS == res)) {
-            if (false == _value_present) {
-                res = _reader.next(_buf + sizeof(VarCharField::LengthValueType), &buf_size);
-                if (OLAP_LIKELY(OLAP_SUCCESS == res)) {
-                    *_real_length = static_cast(buf_size);
-                }
-            } else {
-                *_real_length = 0;
-                memset(_buf, 0, sizeof(VarCharField::LengthValueType));
-                _buf[sizeof(VarCharField::LengthValueType)] = '\0';
-            }
-        }
-
-        return res;
-    }
     virtual OLAPStatus next_vector(
-            uint8_t* batch_buf,
-            uint32_t batch_buf_len,
-            uint32_t start_row_in_block,
-            uint32_t batch_size,
-            std::vector& offset) {
-        OLAPStatus res = OLAP_SUCCESS;
-        VarCharField::OffsetValueType* offset_value =
-            reinterpret_cast(batch_buf + offset[0]);
-        char* string_value = reinterpret_cast(batch_buf + offset[1]);
-        uint32_t start_offset = offset[1];
-
-        for (uint32_t i = start_row_in_block; i < batch_size; i++) {
-            res = this->next();
-            if (OLAP_SUCCESS != res && OLAP_ERR_COLUMN_VALUE_NULL != res) {
-                OLAP_LOG_WARNING("fail to get next. [res=%d]", res);
-                return res;
+            ColumnVector* column_vector,
+            uint32_t size,
+            MemPool* mem_pool) {
+        OLAPStatus res = ColumnReader::next_vector(column_vector, size, mem_pool);
+        if (OLAP_SUCCESS != res) {
+            if (OLAP_ERR_DATA_EOF == res) {
+                _eof = true;
             }
-
-            *offset_value = start_offset;
-            offset_value++;
-            uint32_t string_value_len = *_real_length + sizeof(VarCharField::LengthValueType);
-            memcpy(string_value, _buf, string_value_len);
-            string_value += string_value_len;
-            start_offset += string_value_len;
+            return res;
         }
 
-        return OLAP_SUCCESS;
+        return _reader.next_vector(column_vector, size, mem_pool, &_stats->bytes_read);
     }
 
     virtual size_t get_buffer_size() {
@@ -711,10 +719,9 @@ public:
     }
 
 private:
-    char* _buf;
+    bool _eof;
     ReaderClass _reader;
     uint32_t _max_length;
-    VarCharField::LengthValueType* _real_length;
 };
 
 template 
@@ -723,19 +730,21 @@ public:
     FloatintPointColumnReader(uint32_t column_id, uint32_t column_unique_id) :
             ColumnReader(column_id, column_unique_id),
             _eof(false),
-            _data_stream(NULL) {
-    }
+            _data_stream(NULL),
+            _values(NULL) {}
 
     virtual ~FloatintPointColumnReader() {}
 
-    virtual OLAPStatus init(std::map* streams) {
+    virtual OLAPStatus init(std::map* streams,
+                            int size, MemPool* mem_pool,
+                            OlapReaderStatistics* stats) {
         if (NULL == streams) {
             OLAP_LOG_WARNING("input streams is NULL");
             return OLAP_ERR_INPUT_PARAMETER_ERROR;
         }
 
         // reset stream and reader
-        ColumnReader::init(streams);
+        ColumnReader::init(streams, size, mem_pool, stats);
         _data_stream = extract_stream(_column_unique_id,
                        StreamInfoMessage::DATA,
                        streams);
@@ -745,18 +754,10 @@ public:
             return OLAP_ERR_COLUMN_STREAM_NOT_EXIST;
         }
 
+        _values = reinterpret_cast(mem_pool->allocate(size * sizeof(FLOAT_TYPE)));
+
         return OLAP_SUCCESS;
     }
-    virtual OLAPStatus attach(RowCursor* cursor) {        
-        OLAPStatus res;
-        if (true == _value_present) {
-            res = cursor->set_null(_column_id);
-        } else {
-            res = cursor->attach_by_index(_column_id,  reinterpret_cast(&_value), false);
-        }
-        return res;
-    }
-
     virtual OLAPStatus seek(PositionProvider* position) {
         if (NULL == position) {
             OLAP_LOG_WARNING("input positions is NULL");
@@ -799,22 +800,49 @@ public:
         return _data_stream->skip(skip_values_count * sizeof(FLOAT_TYPE));
     }
 
-    virtual OLAPStatus next() {
+    virtual OLAPStatus next_vector(
+            ColumnVector* column_vector,
+            uint32_t size,
+            MemPool* mem_pool) {
+
         if (NULL == _data_stream) {
             OLAP_LOG_WARNING("reader not init.");
             return OLAP_ERR_NOT_INITED;
         }
 
-        OLAPStatus res = ColumnReader::next();
+        OLAPStatus res = ColumnReader::next_vector(column_vector, size, mem_pool);
+        if (OLAP_SUCCESS != res) {
+            if (OLAP_ERR_DATA_EOF == res) {
+                _eof = true;
+            }
+            return res;
+        }
 
-        if (OLAP_SUCCESS == res) {
-            if (false == _value_present) {
-                size_t length = sizeof(_value);
-                res = _data_stream->read(reinterpret_cast(&_value), &length);
-            } else {
-                _value = 0.0;
+        bool* is_null = column_vector->is_null();
+        column_vector->set_col_data(_values);
+        size_t length = sizeof(FLOAT_TYPE);
+        if (column_vector->no_nulls()) {
+            for (uint32_t i = 0; i < size; ++i) {
+                FLOAT_TYPE value = 0.0;
+                res = _data_stream->read(reinterpret_cast(&value), &length);
+                if (OLAP_SUCCESS != res) {
+                    break;
+                }
+                _values[i] = value;
+            }
+        } else {
+            for (uint32_t i = 0; i < size; ++i) {
+                FLOAT_TYPE value = 0.0;
+                if (!is_null[i]) {
+                    res = _data_stream->read(reinterpret_cast(&value), &length);
+                    if (OLAP_SUCCESS != res) {
+                        break;
+                    }
+                }
+                _values[i] = value;
             }
         }
+        _stats->bytes_read += sizeof(FLOAT_TYPE) * size;
 
         if (OLAP_ERR_DATA_EOF == res) {
             _eof = true;
@@ -823,86 +851,33 @@ public:
         return res;
     }
 
-    virtual OLAPStatus next_vector(
-            uint8_t* batch_buf,
-            uint32_t batch_buf_len,
-            uint32_t start_row_in_block,
-            uint32_t batch_size,
-            std::vector& offset) {
-        char* return_value = reinterpret_cast(batch_buf + offset.front());
-
-        for (uint32_t i = start_row_in_block; i < batch_size; i++) {
-            OLAPStatus res = ColumnReader::next();
-
-            if (OLAP_SUCCESS == res) {
-                if (false == _value_present) {
-                    size_t length = sizeof(FLOAT_TYPE);
-                    res = _data_stream->read(return_value, &length);
-                } else {
-                    *reinterpret_cast(return_value) = 0.0;
-                }
-
-                return_value += sizeof(FLOAT_TYPE);
-            }
-
-            if (OLAP_ERR_DATA_EOF == res) {
-                _eof = true;
-            }
-        }
-
-        return OLAP_SUCCESS;
-    }
-
 protected:
     bool _eof;
     ReadOnlyFileStream* _data_stream;
-    FLOAT_TYPE _value;
+    FLOAT_TYPE* _values;
 };
 
 class DecimalColumnReader : public ColumnReader {
 public:
     DecimalColumnReader(uint32_t column_id, uint32_t column_unique_id);
     virtual ~DecimalColumnReader();
-    virtual OLAPStatus init(std::map* streams);
-    virtual OLAPStatus attach(RowCursor* cursor);
+    OLAPStatus init(std::map* streams,
+                    int size, MemPool* mem_pool,
+                    OlapReaderStatistics* stats) override;
     virtual OLAPStatus seek(PositionProvider* positions);
     virtual OLAPStatus skip(uint64_t row_count);
-    virtual OLAPStatus next();
     virtual OLAPStatus next_vector(
-            uint8_t* batch_buf,
-            uint32_t batch_buf_len,
-            uint32_t start_row_in_block,
-            uint32_t batch_size,
-            std::vector& offset) {
-        OLAPStatus res = OLAP_SUCCESS;
-        DecimalBuf* return_value =
-            reinterpret_cast(batch_buf + offset[0]);
-
-        for (uint32_t i = start_row_in_block; i < batch_size; i++) {
-            res = this->next();
-
-            if (OLAP_SUCCESS != res) {
-                OLAP_LOG_WARNING("fail to get next.[res=%d]", res);
-                return res;
-            }
-
-            *return_value = _value;
-            return_value++;
-        }
-
-        return OLAP_SUCCESS;
-    }
+            ColumnVector* column_vector,
+            uint32_t size,
+            MemPool* mem_pool);
 
     virtual size_t get_buffer_size() {
         return sizeof(RunLengthByteReader) * 2;
     }
 
 private:
-    struct DecimalBuf {
-        int64_t _int;
-        int32_t _frac;     // 最大64K
-    } __attribute__((packed));
-    DecimalBuf _value;
+    bool _eof;
+    decimal12_t* _values;
     RunLengthIntegerReader* _int_reader;
     RunLengthIntegerReader* _frac_reader;
 };
@@ -911,42 +886,23 @@ class LargeIntColumnReader : public ColumnReader {
 public:
     LargeIntColumnReader(uint32_t column_id, uint32_t column_unique_id);
     virtual ~LargeIntColumnReader();
-    virtual OLAPStatus init(std::map* streams);
-    virtual OLAPStatus attach(RowCursor* cursor);
+    virtual OLAPStatus init(std::map* streams,
+                            int size, MemPool* mem_pool,
+                            OlapReaderStatistics* stats);
     virtual OLAPStatus seek(PositionProvider* positions);
     virtual OLAPStatus skip(uint64_t row_count);
-    virtual OLAPStatus next();
     virtual OLAPStatus next_vector(
-            uint8_t* batch_buf,
-            uint32_t batch_buf_len,
-            uint32_t start_row_in_block,
-            uint32_t batch_size,
-            std::vector& offset) {
-        OLAPStatus res = OLAP_SUCCESS;
-        int128_t* return_value =
-            reinterpret_cast(batch_buf + offset[0]);
-
-        for (uint32_t i = start_row_in_block; i < batch_size; i++) {
-            res = this->next();
-
-            if (OLAP_SUCCESS != res) {
-                OLAP_LOG_WARNING("fail to get next.[res=%d]", res);
-                return res;
-            }
-
-            *return_value = _value;
-            ++return_value;
-        }
-
-        return OLAP_SUCCESS;
-    }
+            ColumnVector* column_vector,
+            uint32_t size,
+            MemPool* mem_pool);
 
     virtual size_t get_buffer_size() {
         return sizeof(RunLengthByteReader) * 2;
     }
 
 private:
-    int128_t _value;
+    bool _eof;
+    int128_t* _values;
     RunLengthIntegerReader* _high_reader;
     RunLengthIntegerReader* _low_reader;
 };
diff --git a/be/src/olap/column_file/column_writer.cpp b/be/src/olap/column_file/column_writer.cpp
index 3fb8c54909..829ff846de 100755
--- a/be/src/olap/column_file/column_writer.cpp
+++ b/be/src/olap/column_file/column_writer.cpp
@@ -251,18 +251,27 @@ OLAPStatus ColumnWriter::write(RowCursor* row_cursor) {
     OLAPStatus res = OLAP_SUCCESS;
 
     const Field* field = row_cursor->get_field_by_index(_column_id);
-
+    bool is_null = row_cursor->is_null(_column_id);
+    char* buf = field->get_ptr(row_cursor->get_buf());
     if (_is_present) {
-        res = _is_present->write(field->is_null());
+        res = _is_present->write(is_null);
 
-        if (field->is_null()) {
+        if (is_null) {
             _is_found_nulls = true;
         }
     }
 
     if (is_bf_column()) {
-        if (false == field->is_null()) {
-            _bf->add_bytes(field->buf(), field->size());
+        if (!is_null) {
+            if (_field_info.type == OLAP_FIELD_TYPE_CHAR ||
+                _field_info.type == OLAP_FIELD_TYPE_VARCHAR ||
+                _field_info.type == OLAP_FIELD_TYPE_HLL)
+            {
+                StringSlice* slice = reinterpret_cast(buf);
+                _bf->add_bytes(slice->data, slice->size);
+            } else {
+                _bf->add_bytes(buf, field->size());
+            }
         } else {
             _bf->add_bytes(NULL, 0);
         }
@@ -497,12 +506,12 @@ OLAPStatus ByteColumnWriter::write(RowCursor* row_cursor) {
 
     const Field* field = row_cursor->get_field_by_index(column_id());
 
-    if (false == field->is_null()) {
-        char value = *reinterpret_cast(field->buf());
-        _block_statistics.add(field);
+    bool is_null = row_cursor->is_null(column_id());
+    char* buf = field->get_field_ptr(row_cursor->get_buf());
+    _block_statistics.add(buf);
+    if (!is_null) {
+        char value = *reinterpret_cast(buf + 1);
         return _writer->write(value);
-    } else {
-        _block_statistics.add(field);
     }
 
     return OLAP_SUCCESS;
@@ -529,14 +538,6 @@ void ByteColumnWriter::record_position() {
     _writer->get_position(index_entry());
 }
 
-ColumnStatistics* ByteColumnWriter::segment_statistics() {
-    return &_segment_statistics;
-}
-
-ColumnStatistics* ByteColumnWriter::block_statistics() {
-    return &_block_statistics;
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 
 IntegerColumnWriter::IntegerColumnWriter(
@@ -650,12 +651,12 @@ OLAPStatus VarStringColumnWriter::write(RowCursor* row_cursor) {
     }
 
     const Field* field = row_cursor->get_field_by_index(column_id());
+    bool is_null = row_cursor->is_null(column_id());
+    char* buf = field->get_ptr(row_cursor->get_buf());
 
-    if (false == field->is_null()) {
-        VarCharField::LengthValueType* length;
-        length = reinterpret_cast(field->buf());
-        const char* str = reinterpret_cast(length) + sizeof(*length);
-        return write(str, *length);
+    if (!is_null) {
+        StringSlice* slice = reinterpret_cast(buf);
+        return write(slice->data, slice->size);
     }
 
     return OLAP_SUCCESS;
@@ -664,15 +665,14 @@ OLAPStatus VarStringColumnWriter::write(RowCursor* row_cursor) {
 OLAPStatus VarStringColumnWriter::write(const char* str, uint32_t len) {
     OLAPStatus res = OLAP_SUCCESS;
     // zdb shield the dictionary coding
-    std::string key(str, len);
+    //std::string key(str, len);
 
-    if (OLAP_SUCCESS != (res = _data_stream->write(key.c_str(),
-                               key.length()))) {
+    if (OLAP_SUCCESS != (res = _data_stream->write(str, len))) {
         OLAP_LOG_WARNING("fail to write string content.");
         return res;
     }
 
-    if (OLAP_SUCCESS != (res = _length_writer->write(key.length()))) {
+    if (OLAP_SUCCESS != (res = _length_writer->write(len))) {
         OLAP_LOG_WARNING("fail to write string length.");
         return res;
     }
@@ -834,14 +834,6 @@ OLAPStatus VarStringColumnWriter::finalize(ColumnDataHeaderMessage* header) {
     return OLAP_SUCCESS;
 }
 
-ColumnStatistics* VarStringColumnWriter::segment_statistics() {
-    return &_segment_statistics;
-}
-
-ColumnStatistics* VarStringColumnWriter::block_statistics() {
-    return &_block_statistics;
-}
-
 void VarStringColumnWriter::save_encoding(ColumnEncodingMessage* encoding) {
     if (_use_dictionary_encoding) {
         encoding->set_kind(ColumnEncodingMessage::DICTIONARY);
@@ -884,10 +876,13 @@ OLAPStatus FixLengthStringColumnWriter::write(RowCursor* row_cursor) {
     }
 
     const Field* field = row_cursor->get_field_by_index(column_id());
+    bool is_null = row_cursor->is_null(column_id());
+    char* buf = field->get_ptr(row_cursor->get_buf());
 
-    if (false == field->is_null()) {
-        const char* str = reinterpret_cast(field->buf());
-        return VarStringColumnWriter::write(str, _length);
+    if (!is_null) {
+        //const char* str = reinterpret_cast(buf);
+        StringSlice* slice = reinterpret_cast(buf);
+        return VarStringColumnWriter::write(slice->data, slice->size);
     }
 
     return OLAP_SUCCESS;
@@ -948,10 +943,11 @@ OLAPStatus DecimalColumnWriter::write(RowCursor* row_cursor) {
     }
 
     const Field* field = row_cursor->get_field_by_index(column_id());
-
-    if (false == field->is_null()) {
-        decimal12_t value = *reinterpret_cast(field->buf());
-        _block_statistics.add(field);
+    bool is_null = row_cursor->is_null(column_id());
+    char* buf = field->get_field_ptr(row_cursor->get_buf());
+    _block_statistics.add(buf);
+    if (!is_null) {
+        decimal12_t value = *reinterpret_cast(buf + 1);
 
         res = _int_writer->write(value.integer);
         if (OLAP_SUCCESS != res) {
@@ -964,8 +960,6 @@ OLAPStatus DecimalColumnWriter::write(RowCursor* row_cursor) {
             OLAP_LOG_WARNING("fail to write fraction of Decimal.");
             return res;
         }
-    } else {
-        _block_statistics.add(field);
     }
 
     return OLAP_SUCCESS;
@@ -1001,14 +995,6 @@ void DecimalColumnWriter::record_position() {
     _frac_writer->get_position(index_entry(), false);
 }
 
-ColumnStatistics* DecimalColumnWriter::segment_statistics() {
-    return &_segment_statistics;
-}
-
-ColumnStatistics* DecimalColumnWriter::block_statistics() {
-    return &_block_statistics;
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 
 LargeIntColumnWriter::LargeIntColumnWriter(uint32_t column_id,
@@ -1064,11 +1050,12 @@ OLAPStatus LargeIntColumnWriter::write(RowCursor* row_cursor) {
     }
 
     const Field* field = row_cursor->get_field_by_index(column_id());
+    bool is_null = row_cursor->is_null(column_id());
+    char* buf = field->get_field_ptr(row_cursor->get_buf());
+    _block_statistics.add(buf);
+    if (!is_null) {
 
-    if (false == field->is_null()) {
-        _block_statistics.add(field);
-
-        int64_t* value = reinterpret_cast(field->buf());
+        int64_t* value = reinterpret_cast(buf + 1);
         res = _high_writer->write(*value);
         if (OLAP_SUCCESS != res) {
             OLAP_LOG_WARNING("fail to write integer of LargeInt.");
@@ -1080,8 +1067,6 @@ OLAPStatus LargeIntColumnWriter::write(RowCursor* row_cursor) {
             OLAP_LOG_WARNING("fail to write fraction of LargeInt.");
             return res;
         }
-    } else {
-        _block_statistics.add(field);
     }
 
     return OLAP_SUCCESS;
@@ -1117,13 +1102,5 @@ void LargeIntColumnWriter::record_position() {
     _low_writer->get_position(index_entry(), false);
 }
 
-ColumnStatistics* LargeIntColumnWriter::segment_statistics() {
-    return &_segment_statistics;
-}
-
-ColumnStatistics* LargeIntColumnWriter::block_statistics() {
-    return &_block_statistics;
-}
-
 }  // namespace column_file
 }  // namespace palo
diff --git a/be/src/olap/column_file/column_writer.h b/be/src/olap/column_file/column_writer.h
index 7e8ece7416..a07373ed55 100644
--- a/be/src/olap/column_file/column_writer.h
+++ b/be/src/olap/column_file/column_writer.h
@@ -68,9 +68,6 @@ public:
     //   * column_statistics
     virtual OLAPStatus finalize(ColumnDataHeaderMessage* header);
     virtual void save_encoding(ColumnEncodingMessage* encoding);
-    // å­ç±»è¿”回统计信æ¯çš„æŽ¥å£
-    virtual ColumnStatistics* segment_statistics() = 0;
-    virtual ColumnStatistics* block_statistics() = 0;
     uint32_t column_id() const {
         return _column_id;
     }
@@ -82,6 +79,14 @@ public:
     virtual void get_bloom_filter_info(bool* has_bf_column,
                                        uint32_t* bf_hash_function_num,
                                        uint32_t* bf_bit_num);
+
+    ColumnStatistics* segment_statistics() {
+        return &_segment_statistics;
+    }
+
+    ColumnStatistics* block_statistics() {
+        return &_block_statistics;
+    }
 protected:
     ColumnWriter(uint32_t column_id,
             OutStreamFactory* stream_factory,
@@ -141,8 +146,6 @@ public:
     virtual OLAPStatus write(RowCursor* row_cursor);
     virtual OLAPStatus finalize(ColumnDataHeaderMessage* header);
     virtual void record_position();
-    virtual ColumnStatistics* segment_statistics();
-    virtual ColumnStatistics* block_statistics();
     virtual OLAPStatus flush() {
         return OLAP_SUCCESS;
     }
@@ -220,12 +223,11 @@ public:
 
         bool is_null = row_cursor->is_null(column_id());
         const Field* field = row_cursor->get_field_by_index(column_id());
-        if (false == is_null) {
-            _block_statistics.add(field);
-            T value = *reinterpret_cast(field->buf());
+        char* buf = field->get_field_ptr(row_cursor->get_buf());
+        _block_statistics.add(buf);
+        if (!is_null) {
+            T value = *reinterpret_cast(buf + 1);
             return _writer.write(static_cast(value));
-        } else {
-            _block_statistics.add(field);
         }
 
         return OLAP_SUCCESS;
@@ -264,14 +266,6 @@ public:
         ColumnWriter::record_position();
         _writer.record_position(index_entry());
     }
-
-    virtual ColumnStatistics* segment_statistics() {
-        return &_segment_statistics;
-    }
-
-    virtual ColumnStatistics* block_statistics() {
-        return &_block_statistics;
-    }
 private:
     IntegerColumnWriter _writer;
 
@@ -327,9 +321,10 @@ public:
 
         bool is_null = row_cursor->is_null(column_id());
         const Field* field = row_cursor->get_field_by_index(column_id());
-        if (false == is_null) {
-            T* value = reinterpret_cast(field->buf());
-            _block_statistics.add(field);
+        char* buf = field->get_field_ptr(row_cursor->get_buf());
+        _block_statistics.add(buf);
+        if (!is_null) {
+            T* value = reinterpret_cast(buf + 1);
             return _stream->write(reinterpret_cast(value), sizeof(T));
         }
 
@@ -359,13 +354,6 @@ public:
         _stream->get_position(index_entry());
     }
 
-    virtual ColumnStatistics* segment_statistics() {
-        return &_segment_statistics;
-    }
-
-    virtual ColumnStatistics* block_statistics() {
-        return &_block_statistics;
-    }
 private:
     OutStream* _stream;
 
@@ -387,8 +375,6 @@ public:
     virtual OLAPStatus write(RowCursor* row_cursor);
     virtual uint64_t estimate_buffered_memory();
     virtual OLAPStatus finalize(ColumnDataHeaderMessage* header);
-    virtual ColumnStatistics* segment_statistics();
-    virtual ColumnStatistics* block_statistics();
     virtual void save_encoding(ColumnEncodingMessage* encoding);
     virtual void record_position();
     virtual OLAPStatus flush() {
@@ -468,8 +454,6 @@ public:
     virtual OLAPStatus write(RowCursor* row_cursor);
     virtual OLAPStatus finalize(ColumnDataHeaderMessage* header);
     virtual void record_position();
-    virtual ColumnStatistics* segment_statistics();
-    virtual ColumnStatistics* block_statistics();
     virtual OLAPStatus flush() {
         return OLAP_SUCCESS;
     }
@@ -490,8 +474,6 @@ public:
     virtual OLAPStatus write(RowCursor* row_cursor);
     virtual OLAPStatus finalize(ColumnDataHeaderMessage* header);
     virtual void record_position();
-    virtual ColumnStatistics* segment_statistics();
-    virtual ColumnStatistics* block_statistics();
     virtual OLAPStatus flush() {
         return OLAP_SUCCESS;
     }
diff --git a/be/src/olap/column_file/data_writer.cpp b/be/src/olap/column_file/data_writer.cpp
index 4d7e87dc7c..3a7113436a 100644
--- a/be/src/olap/column_file/data_writer.cpp
+++ b/be/src/olap/column_file/data_writer.cpp
@@ -101,12 +101,7 @@ OLAPStatus ColumnDataWriter::attached_by(RowCursor* row_cursor) {
             return OLAP_ERR_OTHER_ERROR;
         }
     }
-
-    if (OLAP_SUCCESS != _row_block->get_row_to_write(_row_index, row_cursor)) {
-        OLAP_LOG_WARNING("fail to get row in row_block.");
-        return OLAP_ERR_OTHER_ERROR;
-    }
-
+    _row_block->get_row(_row_index, row_cursor);
     return OLAP_SUCCESS;
 }
 
@@ -159,7 +154,7 @@ OLAPStatus ColumnDataWriter::_add_segment() {
                 config::push_write_mbytes_per_sec);
     } else {
         res = _segment_writer->init(
-                config::base_expansion_write_mbytes_per_sec);
+                config::base_compaction_write_mbytes_per_sec);
     }
 
     if (OLAP_SUCCESS != res) {
@@ -195,12 +190,7 @@ OLAPStatus ColumnDataWriter::_flush_row_block(RowBlock* row_block, bool is_final
 
     // 目标是将自己的block按æ¡å†™å…¥ç›®æ ‡block中。
     for (uint32_t i = 0; i < row_block->row_block_info().row_num; i++) {
-        res = row_block->get_row_to_read(i, &_cursor);
-        if (OLAP_SUCCESS != res) {
-            OLAP_LOG_WARNING("fail to get row from row block. [res=%d]", res);
-            return OLAP_ERR_WRITER_DATA_WRITE_ERROR;
-        }
-
+        row_block->get_row(i, &_cursor);
         res = _segment_writer->write(&_cursor);
         if (OLAP_SUCCESS != res) {
             OLAP_LOG_WARNING("fail to write row to segment. [res=%d]", res);
@@ -208,13 +198,6 @@ OLAPStatus ColumnDataWriter::_flush_row_block(RowBlock* row_block, bool is_final
         }
     }
 
-    /*
-    if (OLAP_SUCCESS != (res = _segment_writer->create_row_index_entry())) {
-        OLAP_LOG_WARNING("fail to record block position. [res=%d]", res);
-        return OLAP_ERR_WRITER_INDEX_WRITE_ERROR;
-    }
-    */
-
     // 在OLAPIndexä¸­è®°å½•çš„ä¸æ˜¯æ•°æ®æ–‡ä»¶çš„åç§»,而是block的编å·
     if (OLAP_SUCCESS != _index->add_row_block(*row_block, _block_id++)) {
         OLAP_LOG_WARNING("fail to update index.");
@@ -244,7 +227,6 @@ OLAPStatus ColumnDataWriter::_flush_row_block(RowBlock* row_block, bool is_final
         _num_rows = 0;
     }
 
-    row_block->reset_block();
     return OLAP_SUCCESS;
 }
 
@@ -296,6 +278,10 @@ uint64_t ColumnDataWriter::written_bytes() {
     return size;
 }
 
+MemPool* ColumnDataWriter::mem_pool() {
+    return _row_block->mem_pool();
+}
+
 }  // namespace column_file
 }  // namespace palo
 
diff --git a/be/src/olap/column_file/data_writer.h b/be/src/olap/column_file/data_writer.h
index 9bb074f438..210d50d040 100644
--- a/be/src/olap/column_file/data_writer.h
+++ b/be/src/olap/column_file/data_writer.h
@@ -34,6 +34,7 @@ public:
     virtual OLAPStatus finalize();
     virtual OLAPStatus write_row_block(RowBlock* row_block);
     virtual uint64_t written_bytes();
+    virtual MemPool* mem_pool();
 private:
     OLAPStatus _add_segment();
     OLAPStatus _finalize_segment();
diff --git a/be/src/olap/column_file/file_stream.cpp b/be/src/olap/column_file/file_stream.cpp
index 00063b46a8..54b85d79dd 100755
--- a/be/src/olap/column_file/file_stream.cpp
+++ b/be/src/olap/column_file/file_stream.cpp
@@ -25,14 +25,16 @@ ReadOnlyFileStream::ReadOnlyFileStream(
         FileHandler* handler,
         ByteBuffer** shared_buffer,
         Decompressor decompressor,
-        uint32_t compress_buffer_size) :
-        _file_cursor(handler, 0, 0),
-        _compressed_helper(NULL),
-        _uncompressed(NULL),
-        _shared_buffer(shared_buffer),
-        _decompressor(decompressor),
-        _compress_buffer_size(compress_buffer_size + sizeof(StreamHead)),
-        _current_compress_position(std::numeric_limits::max()) {
+        uint32_t compress_buffer_size,
+        OlapReaderStatistics* stats)
+            : _file_cursor(handler, 0, 0),
+            _compressed_helper(NULL),
+            _uncompressed(NULL),
+            _shared_buffer(shared_buffer),
+            _decompressor(decompressor),
+            _compress_buffer_size(compress_buffer_size + sizeof(StreamHead)),
+            _current_compress_position(std::numeric_limits::max()),
+            _stats(stats) {
 }
 
 ReadOnlyFileStream::ReadOnlyFileStream(
@@ -41,14 +43,16 @@ ReadOnlyFileStream::ReadOnlyFileStream(
         uint64_t offset,
         uint64_t length,
         Decompressor decompressor,
-        uint32_t compress_buffer_size) : 
-        _file_cursor(handler, offset, length),
-        _compressed_helper(NULL),
-        _uncompressed(NULL),
-        _shared_buffer(shared_buffer),
-        _decompressor(decompressor),
-        _compress_buffer_size(compress_buffer_size + sizeof(StreamHead)),
-        _current_compress_position(std::numeric_limits::max()) {
+        uint32_t compress_buffer_size,
+        OlapReaderStatistics* stats)
+            : _file_cursor(handler, offset, length),
+            _compressed_helper(NULL),
+            _uncompressed(NULL),
+            _shared_buffer(shared_buffer),
+            _decompressor(decompressor),
+            _compress_buffer_size(compress_buffer_size + sizeof(StreamHead)),
+            _current_compress_position(std::numeric_limits::max()),
+            _stats(stats) {
 }
 
 OLAPStatus ReadOnlyFileStream::_assure_data() {
@@ -63,18 +67,20 @@ OLAPStatus ReadOnlyFileStream::_assure_data() {
 
     StreamHead header;
     size_t file_cursor_used = _file_cursor.position();
-    OLAPStatus res = _file_cursor.read(reinterpret_cast(&header), sizeof(header));
-
-    if (OLAP_UNLIKELY(OLAP_SUCCESS != res)) {
-        OLAP_LOG_WARNING("read header fail");
-        return res;
-    }
-
-    res = _fill_compressed(header.length);
-
-    if (OLAP_UNLIKELY(OLAP_SUCCESS != res)) {
-        OLAP_LOG_WARNING("read header fail");
-        return res;
+    OLAPStatus res = OLAP_SUCCESS;
+    {
+        SCOPED_RAW_TIMER(&_stats->io_ns);
+        res = _file_cursor.read(reinterpret_cast(&header), sizeof(header));
+        if (OLAP_UNLIKELY(OLAP_SUCCESS != res)) {
+            OLAP_LOG_WARNING("read header fail");
+            return res;
+        }
+        res = _fill_compressed(header.length);
+        if (OLAP_UNLIKELY(OLAP_SUCCESS != res)) {
+            OLAP_LOG_WARNING("read header fail");
+            return res;
+        }
+        _stats->compressed_bytes_read += sizeof(header) + header.length;
     }
 
     if (header.type == StreamHead::UNCOMPRESSED) {
@@ -84,12 +90,14 @@ OLAPStatus ReadOnlyFileStream::_assure_data() {
     } else {
         _compressed_helper->set_position(0);
         _compressed_helper->set_limit(_compress_buffer_size);
-
-        res = _decompressor(*_shared_buffer, _compressed_helper);
-
-        if (OLAP_SUCCESS != res) {
-            OLAP_LOG_WARNING("fail to decompress err=%d", res);
-            return res;
+        {
+            SCOPED_RAW_TIMER(&_stats->decompress_ns);
+            res = _decompressor(*_shared_buffer, _compressed_helper);
+            if (OLAP_SUCCESS != res) {
+                OLAP_LOG_WARNING("fail to decompress err=%d", res);
+                return res;
+            }
+            _stats->uncompressed_bytes_read += _compressed_helper->limit();
         }
     }
 
@@ -173,7 +181,6 @@ OLAPStatus ReadOnlyFileStream::_fill_compressed(size_t length) {
     }
 
     OLAPStatus res = _file_cursor.read((*_shared_buffer)->array(), length);
-
     if (OLAP_SUCCESS != res) {
         OLAP_LOG_WARNING("fail to fill compressed buffer.");
         return res;
diff --git a/be/src/olap/column_file/file_stream.h b/be/src/olap/column_file/file_stream.h
index 4875f0e53d..1d318d901b 100755
--- a/be/src/olap/column_file/file_stream.h
+++ b/be/src/olap/column_file/file_stream.h
@@ -28,6 +28,7 @@
 #include "olap/column_file/stream_index_reader.h"
 #include "olap/file_helper.h"
 #include "olap/olap_common.h"
+#include "util/runtime_profile.h"
 
 namespace palo {
 namespace column_file {
@@ -53,14 +54,16 @@ public:
     ReadOnlyFileStream(FileHandler* handler,
             ByteBuffer** shared_buffer,
             Decompressor decompressor,
-            uint32_t compress_buffer_size);
+            uint32_t compress_buffer_size,
+            OlapReaderStatistics* stats);
 
     ReadOnlyFileStream(FileHandler* handler,
             ByteBuffer** shared_buffer,
             uint64_t offset,
             uint64_t length,
             Decompressor decompressor,
-            uint32_t compress_buffer_size);
+            uint32_t compress_buffer_size,
+            OlapReaderStatistics* stats);
 
     ~ReadOnlyFileStream() {
         SAFE_DELETE(_compressed_helper);
@@ -119,6 +122,31 @@ public:
         return _compress_buffer_size;
     }
 
+    inline void get_buf(char** buf, uint32_t* remaining_bytes) {
+        if (UNLIKELY(_uncompressed == NULL)) {
+            *buf = NULL;
+            *remaining_bytes = 0;
+        } else {
+            *buf = _uncompressed->array();
+            *remaining_bytes = _uncompressed->remaining();
+        }
+    }
+
+    inline void get_position(uint32_t* position) {
+        *position = _uncompressed->position();
+    }
+
+    inline void set_position(uint32_t pos) {
+        _uncompressed->set_position(pos);
+    }
+
+    inline int remaining() {
+        if (_uncompressed == NULL) {
+            return 0;
+        }
+        return _uncompressed->remaining();
+    }
+
 private:
     // Use to read a specified range in file
     class FileCursor {
@@ -189,6 +217,11 @@ private:
             _used = offset;
             return OLAP_SUCCESS;
         }
+
+        const std::string& file_name() const { return _file_handler->file_name(); }
+
+        size_t offset() const { return _offset; }
+
     private:
         FileHandler* _file_handler;
         size_t _offset; // start from where
@@ -208,6 +241,8 @@ private:
     size_t _compress_buffer_size;
     size_t _current_compress_position;
 
+    OlapReaderStatistics* _stats;
+
     DISALLOW_COPY_AND_ASSIGN(ReadOnlyFileStream);
 };
 
diff --git a/be/src/olap/column_file/run_length_integer_reader.cpp b/be/src/olap/column_file/run_length_integer_reader.cpp
index 201afd8f03..dfadb9f3cb 100644
--- a/be/src/olap/column_file/run_length_integer_reader.cpp
+++ b/be/src/olap/column_file/run_length_integer_reader.cpp
@@ -22,11 +22,12 @@
 namespace palo {
 namespace column_file {
 
-RunLengthIntegerReader::RunLengthIntegerReader(ReadOnlyFileStream* input, bool is_singed) : 
-        _input(input),
+RunLengthIntegerReader::RunLengthIntegerReader(ReadOnlyFileStream* input, bool is_singed)
+      : _input(input),
         _signed(is_singed),
         _num_literals(0),
-        _used(0) {}
+        _used(0) {
+}
 
 OLAPStatus RunLengthIntegerReader::_read_values() {
     OLAPStatus res = OLAP_SUCCESS;
diff --git a/be/src/olap/column_file/run_length_integer_reader.h b/be/src/olap/column_file/run_length_integer_reader.h
index 461160517b..191f6b9f8b 100644
--- a/be/src/olap/column_file/run_length_integer_reader.h
+++ b/be/src/olap/column_file/run_length_integer_reader.h
@@ -20,6 +20,7 @@
 #include "olap/column_file/run_length_integer_writer.h"
 #include "olap/column_file/stream_index_reader.h"
 #include "olap/olap_define.h"
+#include "util/runtime_profile.h"
 
 namespace palo {
 namespace column_file {
diff --git a/be/src/olap/column_file/segment_reader.cpp b/be/src/olap/column_file/segment_reader.cpp
index 33ee771176..d821453688 100644
--- a/be/src/olap/column_file/segment_reader.cpp
+++ b/be/src/olap/column_file/segment_reader.cpp
@@ -22,6 +22,8 @@
 #include "olap/column_file/file_stream.h"
 #include "olap/column_file/in_stream.h"
 #include "olap/column_file/out_stream.h"
+#include "olap/olap_cond.h"
+#include "olap/row_block.h"
 
 namespace palo {
 namespace column_file {
@@ -33,12 +35,14 @@ SegmentReader::SegmentReader(
         OLAPTable* table,
         OLAPIndex* index,
         uint32_t segment_id,
-        const std::vector& return_columns,
+        const std::vector& used_columns,
         const std::set& load_bf_columns,
         const Conditions* conditions,
+        const std::vector* col_predicates,
         const DeleteHandler& delete_handler,
         const DelCondSatisfied delete_status,
-        RuntimeState* runtime_state) :
+        RuntimeState* runtime_state,
+        OlapReaderStatistics* stats) :
         _file_name(file),
         _table(table),
         _olap_index(index),
@@ -47,66 +51,58 @@ SegmentReader::SegmentReader(
         _delete_handler(delete_handler),
         _delete_status(delete_status),
         _eof(false),
-        _is_init(false),
         _end_block(-1),
         // ç¡®ä¿ç¬¬ä¸€æ¬¡è°ƒç”¨_move_to_next_row,会执行seek_to_block
-        _current_block(-2),
         _block_count(0),
         _num_rows_in_block(0),
         _null_supported(false),
-        _return_columns(return_columns),
+        _used_columns(used_columns),
         _load_bf_columns(load_bf_columns),
         _mmap_buffer(NULL),
         _include_blocks(NULL),
-        _filted_rows(0),
         _is_using_mmap(false),
         _is_data_loaded(false),
         _buffer_size(0),
         _lru_cache(NULL),
-        _cache_handle(NULL),
-        _vectorized_info_inited(false),
         _runtime_state(runtime_state),
-        _shared_buffer(NULL) {
+        _shared_buffer(NULL),
+        _stats(stats) {
     _lru_cache = OLAPEngine::get_instance()->index_stream_lru_cache();
+    _tracker.reset(new MemTracker(-1));
+    _mem_pool.reset(new MemPool(_tracker.get()));
 }
 
 SegmentReader::~SegmentReader() {
     SAFE_DELETE(_shared_buffer);
     SAFE_DELETE_ARRAY(_include_blocks);
 
-    std::map::iterator index_it = _indices.begin();
-    for (; index_it != _indices.end(); ++index_it) {
-        SAFE_DELETE((*index_it).second);
+    for (auto& index_it : _indices) {
+        SAFE_DELETE(index_it.second);
     }
 
-    std::map::iterator bf_it = _bloom_filters.begin();
-    for (; bf_it != _bloom_filters.end(); ++bf_it) {
-        SAFE_DELETE(bf_it->second);
+    for (auto& bf_it : _bloom_filters) {
+        SAFE_DELETE(bf_it.second);
     }
 
-    for (int32_t i = 0; i < _get_included_row_index_stream_num(); i++) {
-        if (NULL != _cache_handle[i]) {
-            _lru_cache->release(_cache_handle[i]);
+    for (auto handle : _cache_handle) {
+        if (handle != nullptr) {
+            _lru_cache->release(handle);
         }
     }
 
-    SAFE_DELETE_ARRAY(_cache_handle);
-    _cache_handle = NULL;
     _lru_cache = NULL;
     _file_handler.close();
 
-    if (_runtime_state != NULL) {
+    if (_is_data_loaded && _runtime_state != NULL) {
         MemTracker::update_limits(_buffer_size * -1, _runtime_state->mem_trackers()); 
     }
 
-    std::map::iterator stream_it = _streams.begin();
-    for (; stream_it != _streams.end(); ++stream_it) {
-        delete(*stream_it).second;
+    for (auto& it : _streams) {
+        delete it.second;
     }
 
-    std::vector::iterator reader_it = _column_readers.begin();
-    for (; reader_it != _column_readers.end(); ++reader_it) {
-        delete(*reader_it);
+    for (auto reader : _column_readers) {
+        delete reader;
     }
 
     if (_is_using_mmap) {
@@ -170,87 +166,39 @@ OLAPStatus SegmentReader::_set_decompressor() {
         _decompressor = NULL;
         break;
     }
-
     case COMPRESS_LZO: {
         _decompressor = lzo_decompress;
         break;
     }
-
     case COMPRESS_LZ4: {
         _decompressor = lz4_decompress;
         break;
     }
-
     default: {
         OLAP_LOG_WARNING("unknown decompressor");
         return OLAP_ERR_PARSE_PROTOBUF_ERROR;
     }
     }
-
     return OLAP_SUCCESS;
 }
 
 OLAPStatus SegmentReader::_set_segment_info() {
     _num_rows_in_block = _header_message().num_rows_per_block();
-
     if (_num_rows_in_block == 0) {
         _num_rows_in_block = _table->num_rows_per_row_block();
     }
 
     _set_column_map();
     OLAPStatus res = _set_decompressor();
-
     if (OLAP_SUCCESS != res) {
         OLAP_LOG_WARNING("fail to get decompressor.");
         return res;
     }
-
     return OLAP_SUCCESS;
 }
 
-void SegmentReader::_init_vectorized_info(std::vector& return_columns) {
-    if (_vectorized_info_inited) {
-        return;
-    }
-
-    uint32_t offset = 0;
-    uint32_t width = 0;
-
-    for (size_t i = 0; i < return_columns.size(); ++i) {
-        if (_table->tablet_schema()[return_columns[i]].type == OLAP_FIELD_TYPE_VARCHAR
-                || _table->tablet_schema()[return_columns[i]].type == OLAP_FIELD_TYPE_HLL) {
-            width = sizeof(VarCharField::OffsetValueType);
-        } else {
-            width = _table->tablet_schema()[return_columns[i]].length;
-        }
-
-        VectorizedPositionInfo pos;
-        pos.column_id = return_columns[i];
-        pos.column_position = offset;
-        _vectorized_position.push_back(pos);
-        offset += width * _num_rows_in_block;
-    }
-
-    for (size_t i = 0; i < _vectorized_position.size(); i++) {
-        if (_table->tablet_schema()[_vectorized_position[i].column_id].type ==
-            OLAP_FIELD_TYPE_VARCHAR 
-            || _table->tablet_schema()[_vectorized_position[i].column_id].type ==
-            OLAP_FIELD_TYPE_HLL) {
-            _vectorized_position[i].offset_position = offset;
-            offset += _num_rows_in_block *
-                      _table->tablet_schema()[_vectorized_position[i].column_id].length;
-        }
-    }
-
-    _vectorized_info_inited = true;
-}
-
 OLAPStatus SegmentReader::init(bool is_using_cache) {
-    if (_is_init) {
-        return OLAP_SUCCESS;
-    }
-
-    OlapStopWatch timer;
+    SCOPED_RAW_TIMER(&_stats->index_load_ns);
 
     OLAPStatus res = OLAP_SUCCESS;
     res = _load_segment_file();
@@ -258,8 +206,6 @@ OLAPStatus SegmentReader::init(bool is_using_cache) {
         OLAP_LOG_WARNING("fail to load sgment file. ");
         return res;
     }
-
-    uint64_t load_segment_time_us = timer.get_elapse_time_us();
     // 文件头
     res = _set_segment_info();
     if (OLAP_SUCCESS != res) {
@@ -267,14 +213,8 @@ OLAPStatus SegmentReader::init(bool is_using_cache) {
         return res;
     }
 
-    res = _cursor.init(_table->tablet_schema());
-    if (OLAP_SUCCESS != res) {
-        OLAP_LOG_WARNING("fail to init row_cursor");
-        return res;
-    }
-
     _shared_buffer = ByteBuffer::create(
-            _header_message().stream_buffer_size() + sizeof(StreamHead));
+        _header_message().stream_buffer_size() + sizeof(StreamHead));
     if (_shared_buffer == NULL) {
         OLAP_LOG_WARNING("fail to create shared buffer. [size=%lu]", sizeof(ByteBuffer));
         return OLAP_ERR_MALLOC_ERROR;
@@ -286,39 +226,31 @@ OLAPStatus SegmentReader::init(bool is_using_cache) {
         return res;
     }
 
-    timer.reset();
     res = _load_index(is_using_cache);
     if (OLAP_SUCCESS != res) {
         OLAP_LOG_WARNING("fail to load index stream");
         return res;
     }
 
-    uint64_t load_index_stream_time_us = timer.get_elapse_time_us();
-    // record segment init step time when more than 100ms
-    if (load_segment_time_us > 100000 || load_index_stream_time_us > 100000) {
-        OLAP_LOG_WARNING("segment init cost too much time. "
-                         "[load_segment_time_us=%lu load_index_stream_time_us=%lu]",
-                         load_segment_time_us, load_index_stream_time_us);
-    }
-
     return OLAP_SUCCESS;
 }
 
 OLAPStatus SegmentReader::seek_to_block(
-        uint32_t first_block, uint32_t last_block, bool without_filter) {
+        uint32_t first_block, uint32_t last_block,
+        bool without_filter, uint32_t* next_block_id,
+        bool* eof) {
     OLAPStatus res = OLAP_SUCCESS;
 
     if (!_is_data_loaded) {
-
         _reset_readers();
         res = _read_all_data_streams(&_buffer_size);
-        if (OLAP_SUCCESS != res) {
+        if (res != OLAP_SUCCESS) {
             OLAP_LOG_WARNING("fail to read data stream");
             return res;
         }
     
         OLAPStatus res = _create_reader(&_buffer_size);
-        if (OLAP_SUCCESS != res) {
+        if (res != OLAP_SUCCESS) {
             OLAP_LOG_WARNING("fail to create reader");
             return res;
         }
@@ -331,12 +263,20 @@ OLAPStatus SegmentReader::seek_to_block(
         }
 
         _is_data_loaded = true;
-    } else {
-        _eof = false;
     }
 
+    // If seek to block position, all stat will reset to initial
+    _eof = false;
     _end_block = last_block >= _block_count ? _block_count - 1 : last_block;
-    if (!without_filter) {
+    _without_filter = without_filter;
+    delete[] _include_blocks;
+    _include_blocks = nullptr;
+    if (!_without_filter) {
+        /*
+         * row batch may be not empty before next read,
+         * should be clear here, otherwise dirty records
+         * will be read.
+         */
         _remain_block = last_block - first_block + 1;
         res = _pick_row_groups(first_block, last_block);
         if (OLAP_SUCCESS != res) {
@@ -344,55 +284,48 @@ OLAPStatus SegmentReader::seek_to_block(
             return res;
         }
     }
-
-    _current_row = first_block * _num_rows_in_block;
-    OLAP_LOG_DEBUG("first %u end %u; tol %u",
-        first_block, last_block,
-        _block_count);
-    res = _read_block(without_filter);
-
-    if (OLAP_SUCCESS != res) {
-        OLAP_LOG_WARNING("fail to read block");
-        return res;
-    }
+    _seek_to_block(first_block, without_filter);
+    *next_block_id = _next_block_id;
+    *eof = _eof;
 
     return OLAP_SUCCESS;
 }
 
-const RowCursor* SegmentReader::get_next_row(bool without_filter) {
-    RowCursor* ret = NULL;
-    while (true) {
-        OLAPStatus res = _move_to_next_row(without_filter);
-        if (OLAP_SUCCESS != res) {
-            ret = NULL;
-            break;
-        }
-
-        _cursor.reset_buf();
-        res = _read_next_and_attach();
-        if (OLAP_SUCCESS != res) {
-            OLAP_LOG_WARNING("cur %lu to %lu",
-                    _current_row, _header_message().number_of_rows());
-            ret = NULL;
-            break;
-        }
-
-        if (true == without_filter || NULL == _include_blocks
-            || DEL_NOT_SATISFIED == _include_blocks[_current_block]) {
-            ret = &_cursor;
-            break;
-        }
-
-        bool row_del_filter = _delete_handler.is_filter_data(
-                                    _olap_index->version().second, _cursor);
-        if (false == row_del_filter) {
-            ret = &_cursor;
-            break;
-        }
-
-        ++_filted_rows;
+OLAPStatus SegmentReader::get_block(
+        VectorizedRowBatch* batch, uint32_t* next_block_id, bool* eof) {
+    if (_eof) {
+        *eof = true;
+        return OLAP_SUCCESS;
     }
-    return ret;
+
+    // lazy seek
+    _seek_to_block_directly(_next_block_id, batch->columns());
+
+    int64_t num_rows_load = batch->limit();
+    if (OLAP_UNLIKELY(_current_block_id == _block_count - 1)) {
+        int64_t num_rows_left =
+            _header_message().number_of_rows() - _num_rows_in_block * _current_block_id;
+        num_rows_load = std::min(num_rows_load, num_rows_left);
+    }
+
+    auto res = _load_to_vectorized_row_batch(batch, num_rows_load);
+    if (res != OLAP_SUCCESS) {
+        OLAP_LOG_WARNING("fail to load block to vectorized_row_batch. [res=%d]", res);
+        return res;
+    }
+
+    DCHECK(num_rows_load == _num_rows_in_block || _next_block_id == _end_block)
+        << "num_rows_load must equal with _num_rows_in_block"
+        << ", num_rows_load:" << num_rows_load
+        << ", _num_rows_in_block:" << _num_rows_in_block
+        << ", _next_block_id" << _next_block_id
+        << ", _end_block" << _end_block;
+
+    _seek_to_block(_next_block_id + 1, _without_filter);
+
+    *next_block_id = _next_block_id;
+    *eof = _eof;
+    return OLAP_SUCCESS;
 }
 
 void SegmentReader::_set_column_map() {
@@ -415,7 +348,6 @@ void SegmentReader::_set_column_map() {
         ColumnId unique_column_id = _header_message().column(segment_column_id).unique_id();
         if (_unique_id_to_table_id_map.find(unique_column_id) != _unique_id_to_table_id_map.end()) {
             _unique_id_to_segment_id_map[unique_column_id] = segment_column_id;
-            _segment_id_to_unique_id_map[segment_column_id] = unique_column_id;
             // encoding 应该和segment schemaåºä¸€è‡´ã€‚
             _encodings_map[unique_column_id] =
                 _header_message().column_encoding(segment_column_id);
@@ -424,24 +356,7 @@ void SegmentReader::_set_column_map() {
 }
 
 OLAPStatus SegmentReader::_pick_columns() {
-    // _return_columns 使用tableçš„schemaåº
-    // 而return的时候则根æ®return columnsé¡ºåºæŽ’åˆ—
-    //
-    // _include_columns    setï¼Œé¡ºåºæ— å…³, 使用uid
-    // _indices            mapï¼Œé¡ºåºæ— å…³ï¼Œä½¿ç”¨uid
-    // _table_id_to_unique_id_map 是一个从table id转为uid的映射表
-
-    // 如果å‘现没给出return columns,就用全集
-    if (0 == _return_columns.size()) {
-        for (size_t i = 0; i < _table->tablet_schema().size(); ++i) {
-            _return_columns.push_back(i);
-        }
-    }
-
-    _include_columns.clear();
-    _include_bf_columns.clear();
-
-    for (uint32_t i : _return_columns) {
+    for (uint32_t i : _used_columns) {
         ColumnId unique_column_id = _table_id_to_unique_id_map[i];
         _include_columns.insert(unique_column_id);
     }
@@ -457,7 +372,7 @@ OLAPStatus SegmentReader::_pick_columns() {
 OLAPStatus SegmentReader::_pick_delete_row_groups(uint32_t first_block, uint32_t last_block) {
     OLAP_LOG_DEBUG("pick for %u to %u for delete_condition", first_block, last_block);
 
-    if (true == _delete_handler.empty()) {
+    if (_delete_handler.empty()) {
         return OLAP_SUCCESS;
     }
 
@@ -486,7 +401,8 @@ OLAPStatus SegmentReader::_pick_delete_row_groups(uint32_t first_block, uint32_t
                     continue;
                 }
                 StreamIndexReader* index_reader = _indices[unique_column_id];
-                int del_ret = i.second.del_eval(index_reader->entry(j).column_statistic());
+                int del_ret = i.second->del_eval(
+                    index_reader->entry(j).column_statistic().pair());
                 if (DEL_SATISFIED == del_ret) {
                     continue;
                 } else if (DEL_PARTIAL_SATISFIED == del_ret) {
@@ -513,9 +429,10 @@ OLAPStatus SegmentReader::_pick_delete_row_groups(uint32_t first_block, uint32_t
                 --_remain_block;
                 OLAP_LOG_DEBUG("filter block: %d", j);
                 if (j < _block_count - 1) {
-                    _filted_rows += _num_rows_in_block; 
+                    _stats->rows_del_filtered += _num_rows_in_block;
                 } else {
-                    _filted_rows += _header_message().number_of_rows() - j * _num_rows_in_block;
+                    _stats->rows_del_filtered +=
+                        _header_message().number_of_rows() - j * _num_rows_in_block;
                 }
             }
 
@@ -584,14 +501,16 @@ OLAPStatus SegmentReader::_pick_row_groups(uint32_t first_block, uint32_t last_b
                 continue;
             }
 
-            if (!i.second.eval(index_reader->entry(j).column_statistic())) {
+            if (!i.second->eval(index_reader->entry(j).column_statistic().pair())) {
                 _include_blocks[j] = DEL_SATISFIED;
                 --_remain_block;
-            }
-            if (j < _block_count - 1) {
-                _filted_rows += _num_rows_in_block; 
-            } else {
-                _filted_rows += _header_message().number_of_rows() - j * _num_rows_in_block;
+
+                if (j < _block_count - 1) {
+                    _stats->rows_stats_filtered += _num_rows_in_block; 
+                } else {
+                    _stats->rows_stats_filtered +=
+                        _header_message().number_of_rows() - j * _num_rows_in_block;
+                }
             }
         }
     }
@@ -623,13 +542,14 @@ OLAPStatus SegmentReader::_pick_row_groups(uint32_t first_block, uint32_t last_b
                 continue;
             }
 
-            if (!_conditions->columns().at(i).eval(bf_reader->entry(j))) {
+            if (!_conditions->columns().at(i)->eval(bf_reader->entry(j))) {
                 _include_blocks[j] = DEL_SATISFIED;
                 --_remain_block;
                 if (j < _block_count - 1) {
-                    _filted_rows += _num_rows_in_block; 
+                    _stats->rows_stats_filtered += _num_rows_in_block; 
                 } else {
-                    _filted_rows += _header_message().number_of_rows() - j * _num_rows_in_block;
+                    _stats->rows_stats_filtered +=
+                        _header_message().number_of_rows() - j * _num_rows_in_block;
                 }
             }
         }
@@ -665,16 +585,11 @@ OLAPStatus SegmentReader::_load_index(bool is_using_cache) {
     OLAPStatus res = OLAP_SUCCESS;
 
     int32_t handle_num = _get_included_row_index_stream_num();
-    _cache_handle = new(std::nothrow) Cache::Handle*[handle_num];
-    if (NULL == _cache_handle) {
-        OLAP_LOG_WARNING("fail to malloc index stream cache handle.");
-        return OLAP_ERR_MALLOC_ERROR;
-    }
-    memset(reinterpret_cast(_cache_handle), 0, sizeof(Cache::Handle*) * handle_num);
+    _cache_handle.resize(handle_num, nullptr);
 
     ReadOnlyFileStream stream(
             &_file_handler, &_shared_buffer, _decompressor,
-            _header_message().stream_buffer_size());
+            _header_message().stream_buffer_size(), _stats);
     res = stream.init();
     if (OLAP_SUCCESS != res) {
         OLAP_LOG_WARNING("fail to init stream. [res=%d]", res);
@@ -830,37 +745,6 @@ int32_t SegmentReader::_get_index_position(ColumnEncodingMessage::Kind encoding_
     return base;
 }
 
-bool SegmentReader::_is_dictionary(StreamInfoMessage::Kind kind,
-        ColumnEncodingMessage encoding) {
-    return kind == StreamInfoMessage::DICTIONARY_DATA ||
-           (kind == StreamInfoMessage::LENGTH &&
-                   (encoding.kind() == ColumnEncodingMessage::DICTIONARY));
-}
-
-bool SegmentReader::_is_overlap(size_t left_a, size_t right_a,
-        size_t left_b, size_t right_b) {
-    if (left_a <= left_b) {
-        return right_a >= left_b;
-    }
-
-    return  left_a <= right_b;
-}
-
-void SegmentReader::_fill_has_null(std::map* has_null) {
-    for (int32_t stream_index = 0; stream_index < _header_message().stream_info_size();
-            ++stream_index) {
-        const StreamInfoMessage& message = _header_message().stream_info(stream_index);
-        if (message.kind() == StreamInfoMessage::PRESENT) {
-            ColumnId unique_column_id = message.column_unique_id();
-            if (0 == _unique_id_to_segment_id_map.count(unique_column_id)) {
-                continue;
-            }
-
-            (*has_null)[unique_column_id] = true;
-        }
-    }
-}
-
 OLAPStatus SegmentReader::_read_all_data_streams(size_t* buffer_size) {
     int64_t stream_offset = _header_length;
     uint64_t stream_length = 0;
@@ -871,175 +755,104 @@ OLAPStatus SegmentReader::_read_all_data_streams(size_t* buffer_size) {
         const StreamInfoMessage& message = _header_message().stream_info(stream_index);
         stream_length = message.length();
         ColumnId unique_column_id = message.column_unique_id();
-        if (0 == _unique_id_to_segment_id_map.count(unique_column_id)) {
+
+        if (_unique_id_to_segment_id_map.count(unique_column_id) == 0) {
             continue;
         }
 
-        if ((_is_column_included(unique_column_id)
-                && message.kind() == StreamInfoMessage::ROW_INDEX)
-                || (_is_bf_column_included(unique_column_id)
-                && message.kind() == StreamInfoMessage::BLOOM_FILTER)) {
+        if (message.kind() == StreamInfoMessage::ROW_INDEX ||
+                message.kind() == StreamInfoMessage::BLOOM_FILTER) {
             continue;
-        } else {
-            StreamName name(unique_column_id, message.kind());
-            ReadOnlyFileStream* stream = new(std::nothrow) ReadOnlyFileStream(
-                    &_file_handler,
-                    &_shared_buffer,
-                    stream_offset,
-                    stream_length,
-                    _decompressor,
-                    _header_message().stream_buffer_size());
-            if (NULL == stream) {
-                OLAP_LOG_WARNING("fail to create stream");
-                return OLAP_ERR_MALLOC_ERROR;
-            }
-
-            OLAPStatus res = stream->init();
-            if (OLAP_SUCCESS != res) {
-                OLAP_LOG_WARNING("fail to init stream");
-                return res;
-            }
-
-            _streams[name] = stream;
-            *buffer_size += stream->get_buffer_size();
         }
+
+        StreamName name(unique_column_id, message.kind());
+        std::unique_ptr stream(new(std::nothrow) ReadOnlyFileStream(
+            &_file_handler,
+            &_shared_buffer,
+            stream_offset,
+            stream_length,
+            _decompressor,
+            _header_message().stream_buffer_size(), _stats));
+        if (stream == nullptr) {
+            OLAP_LOG_WARNING("fail to create stream");
+            return OLAP_ERR_MALLOC_ERROR;
+        }
+
+        OLAPStatus res = stream->init();
+        if (OLAP_SUCCESS != res) {
+            OLAP_LOG_WARNING("fail to init stream");
+            return res;
+        }
+
+        *buffer_size += stream->get_buffer_size();
+        _streams[name] = stream.release();
     }
 
     return OLAP_SUCCESS;
 }
 
-OLAPStatus SegmentReader::_reader_skip(uint64_t skip_rows) {
-    OLAPStatus res = OLAP_SUCCESS;
-
-    for (size_t i = 0; i < _column_readers.size(); ++i) {
-        res = _column_readers[i]->skip(skip_rows);
-
-        if (OLAP_SUCCESS != res && OLAP_ERR_COLUMN_STREAM_EOF != res) {
-            OLAP_LOG_WARNING("fail to skip reader");
-            break;
-        }
-    }
-
-    return res;
-}
-
 OLAPStatus SegmentReader::_create_reader(size_t* buffer_size) {
-    OLAPStatus res = OLAP_SUCCESS;
-
-    for (size_t i = 0; i < _return_columns.size(); ++i) {
-        // æ ¹æ®returncolumné¡ºåºæŽ’åˆ—
-        ColumnId table_column_id = _return_columns[i];
+    _column_readers.resize(_table->tablet_schema().size(), nullptr);
+    _column_indices.resize(_table->tablet_schema().size(), nullptr);
+    for (auto table_column_id : _used_columns) {
         ColumnId unique_column_id = _table_id_to_unique_id_map[table_column_id];
         // 当剿˜¯ä¸ä¼šå‡ºçްtableå’Œsegmentçš„schemaä¸ä¸€è‡´çš„æƒ…况的
-        ColumnReader* reader = ColumnReader::create(table_column_id,
+        std::unique_ptr reader(ColumnReader::create(table_column_id,
                                _table->tablet_schema(),
                                _unique_id_to_table_id_map,
                                _unique_id_to_segment_id_map,
-                               _encodings_map);
-        if (NULL == reader) {
+                               _encodings_map));
+        if (reader == nullptr) {
             OLAP_LOG_WARNING("fail to create reader");
-            return res;
+            return OLAP_ERR_MALLOC_ERROR;
         }
 
-        std::map::iterator it = _streams.begin();
-        res = reader->init(&_streams);
-        if (OLAP_SUCCESS != res) {
+        auto res = reader->init(&_streams, _num_rows_in_block, _mem_pool.get(), _stats);
+        if (res != OLAP_SUCCESS) {
             OLAP_LOG_WARNING("fail to init reader");
-            delete reader;
             return res;
         }
 
-        // _column_indices 的顺åºå’Œreader一致
-        _column_readers.push_back(reader);
-        if (0 == _unique_id_to_segment_id_map.count(unique_column_id)) {
-            _column_indices.push_back(NULL);
-        } else {
-            _column_indices.push_back(_indices[unique_column_id]);
-        }
         *buffer_size += reader->get_buffer_size();
+        _column_readers[table_column_id] = reader.release();
+        if (_indices.count(unique_column_id) != 0) {
+            _column_indices[table_column_id] = _indices[unique_column_id];
+        }
     }
 
     return OLAP_SUCCESS;
 }
 
-OLAPStatus SegmentReader::_move_to_next_row(bool without_filter) {
-    if (_current_row >= _header_message().number_of_rows()) {
-        _eof = true;
-        return OLAP_ERR_DATA_EOF;
-    }
-
-    if (_current_row % _num_rows_in_block != 0) {
-        ++_current_row;
+OLAPStatus SegmentReader::_seek_to_block_directly(
+        int64_t block_id, const std::vector& cids) {
+    if (_at_block_start && block_id == _current_block_id) {
+        // no need to execute seek
         return OLAP_SUCCESS;
     }
-
-    int64_t next_block = int64_t(_current_row / _num_rows_in_block);
-    if (!without_filter && NULL != _include_blocks && DEL_SATISFIED == _include_blocks[next_block]) {
-        while (next_block < _block_count && DEL_SATISFIED == _include_blocks[next_block]) {
-            ++next_block;
+    for (auto cid : cids) {
+        // If column is added through schema change, column index may not exist because of
+        // linked schema change. So we need to ignore this column's seek
+        if (_column_indices[cid] == nullptr) {
+            continue;
         }
-
-        if (next_block >= _block_count) {
-            _eof = true;
-            return OLAP_ERR_DATA_EOF;
-        }
-
-        _current_row = next_block * _num_rows_in_block;
-    } else if (next_block > _end_block) {
-        _eof = true;
-        return OLAP_ERR_DATA_EOF;
-    }
-
-    if (OLAP_UNLIKELY(next_block != _current_block || 0 == _current_row)) {
-        if (next_block > _current_block) {
-            OLAPStatus res = _seek_to_row_entry(next_block);
-            if (res == OLAP_SUCCESS) {
-                // seek to next_block will be successful in most case
-            } else if (res == OLAP_ERR_DATA_EOF) {
-                _eof = true;
-                return res;
-            } else {
-                OLAP_LOG_WARNING("fail to seek to next block. [res=%d]", res);
-                return res;
-            }
-        }
-
-        _current_block = next_block;
-    }
-
-    _current_row++;
-    return OLAP_SUCCESS;
-}
-
-OLAPStatus SegmentReader::_seek_to_row_entry(int64_t block_id) {
-
-    for (size_t i = 0; i < _column_readers.size(); ++i) {
-        if (block_id >= _block_count) {
-            return OLAP_ERR_DATA_EOF;
-        }
-
-        OLAPStatus res = OLAP_SUCCESS;
         
-        ColumnId table_column_id = _return_columns[i];
-        ColumnId unique_column_id = _table_id_to_unique_id_map[table_column_id];
-        if (0 == _unique_id_to_segment_id_map.count(unique_column_id)) {
-            continue;    
-        }
-        PositionProvider position(&_column_indices[i]->entry(block_id));
-        if (OLAP_SUCCESS != (res = _column_readers[i]->seek(&position))) {
+        OLAPStatus res = OLAP_SUCCESS;
+        PositionProvider position(&_column_indices[cid]->entry(block_id));
+        if (OLAP_SUCCESS != (res = _column_readers[cid]->seek(&position))) {
             if (OLAP_ERR_COLUMN_STREAM_EOF == res) {
                 OLAP_LOG_DEBUG("Stream EOF. [tablet_id=%ld column_id=%u block_id=%lu]",
-                        _table->tablet_id(), _column_readers[i]->column_unique_id(), block_id);
+                        _table->tablet_id(), _column_readers[cid]->column_unique_id(), block_id);
                 return OLAP_ERR_DATA_EOF;
             } else {
                 OLAP_LOG_WARNING("fail to seek to block. "
                         "[tablet_id=%ld column_id=%u block_id=%lu]",
-                        _table->tablet_id(), _column_readers[i]->column_unique_id(), block_id);
+                        _table->tablet_id(), _column_readers[cid]->column_unique_id(), block_id);
                 return OLAP_ERR_COLUMN_SEEK_ERROR;
             }
         }
     }
-
+    _current_block_id = block_id;
+    _at_block_start = true;
     return OLAP_SUCCESS;
 }
 
@@ -1059,6 +872,9 @@ OLAPStatus SegmentReader::_reset_readers() {
 
     for (std::vector<ColumnReader*>::iterator it = _column_readers.begin();
             it != _column_readers.end(); ++it) {
+        if ((*it) == nullptr) {
+            continue;
+        }
         if (_runtime_state != NULL) {
             MemTracker::update_limits(
                     -1 * (*it)->get_buffer_size(), _runtime_state->mem_trackers());
@@ -1071,86 +887,50 @@ OLAPStatus SegmentReader::_reset_readers() {
     return OLAP_SUCCESS;
 }
 
-OLAPStatus SegmentReader::_read_block(bool without_filter) {
-    if (NULL != _include_blocks && !without_filter) {
-        // 如果当å‰rowå°äºŽæ€»row, 并探测当å‰row所在的blockæ˜¯ä¸æ˜¯åº”该被过滤
-        // 如果需è¦è¿‡æ»¤å°±è·³è¿‡ï¼Œç›®çš„æ˜¯æ‰¾åˆ°ç¬¬ä¸€ä¸ªéœ€è¦è¯»å–çš„block
-        while (_current_row < _header_message().number_of_rows() &&
-                DEL_SATISFIED == _include_blocks[(int)(_current_row / _num_rows_in_block)]) {
-            _current_row = _current_row + _num_rows_in_block;
+void SegmentReader::_seek_to_block(int64_t block_id, bool without_filter) {
+    if (_include_blocks != nullptr && !without_filter) {
+        while (block_id <= _end_block && _include_blocks[block_id] == DEL_SATISFIED) {
+            block_id++;
         }
     }
-
-    OLAP_LOG_DEBUG("--> entry %d cur %lu tol %lu",
-            (int)(_current_row / _num_rows_in_block),
-            _current_row, _header_message().number_of_rows());
-
-    if (_current_row < _header_message().number_of_rows()) {
-        // seek到起始ä½ç½®
-        return _seek_to_row_entry((int)(_current_row / _num_rows_in_block));
+    if (block_id > _end_block) {
+        _eof = true;
     }
-
-    return OLAP_SUCCESS;
+    _next_block_id = block_id;
 }
 
-OLAPStatus SegmentReader::get_row_batch(
-        uint8_t* batch_buf,
-        uint32_t batch_buf_len,
-        uint32_t* start_row_index,
-        uint32_t* batch_row_num,
-        uint32_t* block_row_num,
-        std::vector& return_columns) {
-    OLAPStatus res = OLAP_SUCCESS;
-
-    _init_vectorized_info(return_columns);
-
-    res = _seek_to_row_entry(_current_block);
-    if (OLAP_SUCCESS != res) {
-        OLAP_LOG_WARNING("fail to seek to row entry.[res=%d block_id=%d]", res, _current_block);
-        return res;
-    }
-
-    uint64_t start_row_in_block = (_current_row - 1) % _num_rows_in_block;
-
-    for (size_t i = 0; i < _column_readers.size(); ++i) {
-        if (return_columns.end() == std::find(return_columns.begin(),
-                return_columns.end(), _column_readers[i]->column_id())) {
-            continue;
-        }
-
-        uint32_t batch_size = _num_rows_in_block - start_row_in_block;
-        std::vector offset;
-
-        for (size_t j = 0; j < _vectorized_position.size(); ++j) {
-            if (_vectorized_position[j].column_id == _column_readers[i]->column_id()) {
-                offset.push_back(_vectorized_position[j].column_position);
-                offset.push_back(_vectorized_position[j].offset_position);
-                break;
-            }
-        }
-
-        if (offset.begin() == offset.end()) {
-            OLAP_LOG_WARNING("Fail to get offset![column_id=%d]",
-                    _column_readers[i]->column_id());
-            return OLAP_ERR_OTHER_ERROR;
-        }
-
-        res = _column_readers[i]->next_vector(
-                batch_buf, batch_buf_len, start_row_in_block, batch_size, offset);
-
-        if (OLAP_SUCCESS != res && OLAP_ERR_DATA_EOF != res) {
-            OLAP_LOG_WARNING("Fail to next vector.[res=%d column_unique_id=%d]",
-                    res, _column_readers[i]->column_unique_id());
+OLAPStatus SegmentReader::_load_to_vectorized_row_batch(
+        VectorizedRowBatch* batch, size_t size) {
+    SCOPED_RAW_TIMER(&_stats->block_load_ns);
+    MemPool* mem_pool = batch->mem_pool();
+    for (auto cid : batch->columns()) {
+        auto reader = _column_readers[cid];
+        auto res = reader->next_vector(batch->column(cid), size, mem_pool);
+        if (res != OLAP_SUCCESS) {
+            OLAP_LOG_WARNING("fail to read next, res = %d, column = %u",
+                    res, reader->column_unique_id());
             return res;
         }
     }
+    batch->set_size(size);
+    if (_include_blocks != nullptr) {
+        batch->set_block_status(_include_blocks[_current_block_id]);
+    } else {
+        batch->set_block_status(DEL_PARTIAL_SATISFIED);
+    }
+    // If size is just _num_rows_in_block, after read, we point to next block start,
+    // so we increase _current_block_id
+    if (size == _num_rows_in_block) {
+        _current_block_id++;
+    } else {
+        _at_block_start = false;
+    }
 
-    *start_row_index = start_row_in_block;
-    *batch_row_num = _num_rows_in_block - start_row_in_block;
-    *block_row_num = _num_rows_in_block;
+    _stats->blocks_load++;
+    _stats->raw_rows_read += size;
 
     return OLAP_SUCCESS;
 }
 
 }  // namespace column_file
-}  // namespace palo
+}  // namespace palo
diff --git a/be/src/olap/column_file/segment_reader.h b/be/src/olap/column_file/segment_reader.h
index b3738fe066..dcef7d709a 100644
--- a/be/src/olap/column_file/segment_reader.h
+++ b/be/src/olap/column_file/segment_reader.h
@@ -39,7 +39,9 @@
 #include "olap/olap_table.h"
 #include "olap/row_cursor.h"
 #include "runtime/runtime_state.h"
-#include "runtime/mem_tracker.h"
+#include "runtime/mem_pool.h"
+
+#include "olap/column_predicate.h"
 
 namespace palo {
 namespace column_file {
@@ -56,9 +58,11 @@ public:
             const std::vector& return_columns,
             const std::set& load_bf_columns,
             const Conditions* conditions,
+            const std::vector* col_predicates,
             const DeleteHandler& delete_handler,
             const DelCondSatisfied delete_status,
-            RuntimeState* runtime_state);
+            RuntimeState* runtime_state,
+            OlapReaderStatistics* stats);
 
     ~SegmentReader();
 
@@ -69,6 +73,10 @@ public:
     // @return [description]
     OLAPStatus init(bool is_using_cache);
 
+    // Must called before seek to block.
+    // TODO(zc)
+    OLAPStatus prepare(const std::vector& columns);
+
     // 指定读å–的第一个block和最åŽä¸€ä¸ªblock,并åˆå§‹åŒ–column reader
     // seek_to_block支æŒè¢«å¤šæ¬¡è°ƒç”¨
     // Inputs:
@@ -78,18 +86,19 @@ public:
     // 1. 按conditions过滤index中的统计信æ¯,  确定需è¦è¯»å–çš„block列表
     // 2. 读å–blocks, 构造InStream
     // 3. 创建并åˆå§‹åŒ–Readers
-    //
-    OLAPStatus seek_to_block(uint32_t first_block, uint32_t last_block, bool without_filter);
+    // Outputs:
+    // next_block_id: 
+    //      block with next_block_id would read if get_block called again.
+    //      this field is used to set batch's limit when client found logical end is reach
+    OLAPStatus seek_to_block(uint32_t first_block, uint32_t last_block, bool without_filter, 
+                             uint32_t* next_block_id, bool* eof);
 
-    // 返回下一行数æ®
-    // @return 绑定数æ®çš„RowCursor,失败或无数æ®å¯è¯»åˆ™è¿”回NULL
-    const RowCursor* get_next_row(bool without_filter);
-
-    // 返回最åŽä¸€è¡Œæ•°æ®
-    // @return 绑定数æ®çš„RowCursor,失败或无数æ®å¯è¯»åˆ™è¿”回NULL
-    const RowCursor* get_current_row() const {
-        return (!_eof) ? &_cursor : NULL;
-    }
+    // get vector batch from this segment.
+    // next_block_id: 
+    //      block with next_block_id would read if get_block called again.
+    //      this field is used to set batch's limit when client found logical end is reach
+    // ATTN: If you change batch to contain more columns, you must call seek_to_block again.
+    OLAPStatus get_block(VectorizedRowBatch* batch, uint32_t* next_block_id, bool* eof);
 
     bool eof() const {
         return _eof;
@@ -100,11 +109,6 @@ public:
         return _block_count;
     }
 
-    // 返回当å‰è¡Œæ‰€å¤„çš„blockæ•°
-    uint32_t current_block() {
-        return _current_block;
-    }
-
     // 返回当å‰semgnet中,æ¯å—的行数
     uint32_t num_rows_in_block() {
         return _num_rows_in_block;
@@ -117,26 +121,7 @@ public:
     // åªå…许在åˆå§‹åŒ–之å‰é€‰æ‹©ï¼Œä¹‹åŽåˆ™æ— æ³•更改
     // 暂时没有动æ€åˆ‡æ¢çš„需求
     void set_is_using_mmap(bool is_using_mmap) {
-        if (!_is_init) {
-            _is_using_mmap = is_using_mmap;
-        } else {
-            OLAP_LOG_WARNING("segment reader has alreay inited, "
-                    "can't change is_using_mmap [now=%d]",
-                    _is_using_mmap);
-        }
-    }
-
-    OLAPStatus get_row_batch(
-            uint8_t* batch_buf,
-            uint32_t batch_buf_len,
-            uint32_t* start_row_index,
-            uint32_t* batch_row_num,
-            uint32_t* block_row_num,
-            std::vector& return_columns);
-
-    uint64_t get_filted_rows() const {
-        OLAP_LOG_DEBUG("SegmentReader _filted_rows: %lu", _filted_rows); 
-        return _filted_rows;
+        _is_using_mmap = is_using_mmap;
     }
 
 private:
@@ -187,10 +172,6 @@ private:
     // 设置segment的相关信æ¯ï¼Œè§£åŽ‹å™¨ï¼Œåˆ—ï¼Œç¼–ç ç­‰
     OLAPStatus _set_segment_info();
 
-    // æ£€æŸ¥æ˜¯å¦æœ‰ä¸ºç©ºçš„列
-    // @param has_null 返回一个 columnd->æ˜¯å¦æœ‰ç©ºå€¼ 的映射
-    void _fill_has_null(std::map* has_null);
-
     // 检查列存文件版本
     // @return 返回OLAP_SUCCESS代表版本检查通过
     OLAPStatus _check_file_version();
@@ -223,13 +204,6 @@ private:
             bool is_compressed,
             bool has_null);
 
-    // æ ¹æ®stream类型和编ç åˆ¤æ–­æ˜¯å¦é‡‡ç”¨äº†å­—典编ç 
-    bool _is_dictionary(StreamInfoMessage::Kind kind,
-            ColumnEncodingMessage encoding);
-
-    // 判断两段区域是å¦é‡å 
-    bool _is_overlap(size_t left_a, size_t right_a, size_t left_b, size_t right_b);
-
     // 读出所有列,完整的æµï¼Œï¼ˆè¿™é‡Œåªæ˜¯åˆ›å»ºstream,在orc file里因为没有mmapå› 
     // æ­¤æ„味ç€å®žé™…的数æ®è¯»å–, è€Œåœ¨è¿™é‡Œå¹¶æ²¡æœ‰å®žé™…çš„è¯»ï¼Œåªæ˜¯åœˆå‡ºæ¥éœ€è¦çš„范围)
     OLAPStatus _read_all_data_streams(size_t* buffer_size);
@@ -238,22 +212,19 @@ private:
     // 创建reader
     OLAPStatus _create_reader(size_t* buffer_size);
 
-    // è¯»å–æ•°æ®ï¼Œä¼šæ ¹æ®æœ‰æ²¡æœ‰æ¡ä»¶ï¼Œæ˜¯ä¸æ˜¯éœ€è¦æ‰«å…¨æ•°æ®æ¥åˆ†åˆ«ä½¿ç”¨
-    // _read_all_data_streams或_read_partial_data_streams
-    OLAPStatus _read_block(bool without_filter);
+    // we impelete seek to block in two phase. first, we just only move _next_block_id
+    // to the position that we want goto; second, we seek the column streams to the
+    // position we going to read.
+    void _seek_to_block(int64_t block_id, bool without_filter);
 
-    // 所有reader skip一定row
-    OLAPStatus _reader_skip(uint64_t skip_rows);
-
-    // å‰è¿›n行
-    OLAPStatus _move_to_next_row(bool without_filter);
+    // seek to block id without check. only seek in cids's read stream.
+    // because some columns may not be read
+    OLAPStatus _seek_to_block_directly(
+        int64_t block_id, const std::vector& cids);
 
     // 跳转到æŸä¸ªrow entry
     OLAPStatus _seek_to_row_entry(int64_t block_id);
 
-    // 读å–下一行并attach到cursor上
-    inline OLAPStatus _read_next_and_attach();
-
     OLAPStatus _reset_readers();
 
     // 获å–当å‰çš„table级schema。
@@ -267,8 +238,6 @@ private:
 
     OLAPStatus _init_include_blocks(uint32_t first_block, uint32_t last_block);
 
-    void _init_vectorized_info(std::vector& return_columns);
-
     inline const int32_t _get_included_row_index_stream_num() {
         int32_t included_row_index_stream_num = 0;
         for (int32_t i = 0; i < _header_message().stream_info_size(); ++i) {
@@ -288,6 +257,9 @@ private:
         return included_row_index_stream_num;
     }
 
+    OLAPStatus _load_to_vectorized_row_batch(
+        VectorizedRowBatch* batch, size_t size);
+
 private:
     static const int32_t BYTE_STREAM_POSITIONS = 1;
     static const int32_t RUN_LENGTH_BYTE_POSITIONS = BYTE_STREAM_POSITIONS + 1;
@@ -309,20 +281,33 @@ private:
     const Conditions* _conditions;         // 列过滤æ¡ä»¶
     DeleteHandler _delete_handler;
     DelCondSatisfied _delete_status;
-    RowCursor _cursor;                     // 返回数æ®ä½¿ç”¨çš„helper cursor
 
     bool _eof;                             // eof标志
-    bool _is_init;
+
+    // If this field is false, client must to call seek_to_block before
+    // calling get_block.
+    bool _at_block_start = false;
+
     int64_t _end_block;                           // 本次读å–的结æŸå—
-    int64_t _current_block;                       // 当å‰è¯»å–到的å—
+    int64_t _current_block_id = 0;                       // 当å‰è¯»å–到的å—
+
+    // this is set by _seek_to_block, when get_block is called, first
+    // seek to this block_id, then read block.
+    int64_t _next_block_id = 0;
     int64_t _block_count;             // æ¯ä¸€åˆ—中,index entry的数目应该相等。
 
     uint64_t _num_rows_in_block;
     bool _null_supported;
     uint64_t _header_length;           // Header(FixHeader+PB)大å°ï¼Œè¯»æ•°æ®æ—¶éœ€è¦åç§»
-    uint64_t _current_row;             // 当å‰row在整个segment中是第几æ¡
 
-    std::vector _return_columns; // è¦è¿”回的列,里边的id为tablet_schema_id
+    // columns that can be used by client. when client seek to range's start or end,
+    // client may read more columns than normal read.
+    // For example: 
+    //  table1's schema is 'k1, k2, v1'. which k1, k2 is key column, v1 is value column.
+    //  for query 'select sum(v1) from table1', client split all data to sub-range in logical,
+    //  so, one sub-range need to seek to right position with k1 and k2; then only read v1.
+    //  In this situation, _used_columns contains (k1, k2, v1)
+    std::vector _used_columns;
     std::vector _column_readers;    // 实际的数æ®è¯»å–器
     std::vector _column_indices; // ä¿å­˜columnçš„index
 
@@ -333,7 +318,6 @@ private:
     UniqueIdToColumnIdMap _table_id_to_unique_id_map; // table id到unique id的映射
     UniqueIdToColumnIdMap _unique_id_to_table_id_map; // unique id到table id的映射
     UniqueIdToColumnIdMap _unique_id_to_segment_id_map; // uniqid到segment id的映射
-    UniqueIdToColumnIdMap _segment_id_to_unique_id_map; //segment id到uniqid的映射
 
     std::map _indices;
     std::map _streams;      //需è¦è¯»å–çš„æµ
@@ -353,49 +337,29 @@ private:
     */
     uint8_t* _include_blocks;
     uint32_t _remain_block;
-    uint64_t _filted_rows;
     bool _need_block_filter;   //与include blocks组åˆä½¿ç”¨ï¼Œå¦‚果全ä¸ä¸­ï¼Œå°±ä¸å†è¯»
     bool _is_using_mmap;                     // 这个标记为true时,使用mmapæ¥è¯»å–文件
     bool _is_data_loaded;
     size_t _buffer_size;
 
     Cache* _lru_cache;
-    Cache::Handle** _cache_handle;
+    std::vector<Cache::Handle*> _cache_handle;
     FileHeader _file_header;
 
-    bool _vectorized_info_inited;
-    std::vector _vectorized_position;
+    std::unique_ptr _tracker;
+    std::unique_ptr _mem_pool;
 
     RuntimeState* _runtime_state;  // 用于统计内存消耗等è¿è¡Œæ—¶ä¿¡æ¯
     ByteBuffer* _shared_buffer;
 
+    // Set when seek_to_block is called, valid until next seek_to_block is called.
+    bool _without_filter = false;
+
+    OlapReaderStatistics* _stats;
+
     DISALLOW_COPY_AND_ASSIGN(SegmentReader);
 };
 
-inline OLAPStatus SegmentReader::_read_next_and_attach() {
-    OLAPStatus res = OLAP_SUCCESS;
-
-    for (std::vector<ColumnReader*>::iterator it = _column_readers.begin();
-            it != _column_readers.end(); ++it) {
-        res = (*it)->next();
-
-        if (OLAP_SUCCESS != res) {
-            OLAP_LOG_WARNING("fail to read next, res = %d, column = %u",
-                    res, (*it)->column_unique_id());
-            return res;
-        }
-
-        res = (*it)->attach(&_cursor);
-
-        if (OLAP_SUCCESS != res) {
-            OLAP_LOG_WARNING("fail to attach reader. [res=%d]", res);
-            return res;
-        }
-    }
-
-    return res;
-}
-
 }  // namespace column_file
 }  // namespace palo
 
diff --git a/be/src/olap/column_file/serialize.cpp b/be/src/olap/column_file/serialize.cpp
index c9842f6e3e..97120ffc15 100644
--- a/be/src/olap/column_file/serialize.cpp
+++ b/be/src/olap/column_file/serialize.cpp
@@ -253,34 +253,65 @@ OLAPStatus read_ints(ReadOnlyFileStream* input, int64_t* data, uint32_t count, u
     uint32_t bits_left = 0;
     char current = '\0';
 
-    for (uint32_t i = 0; i < count; i++) {
-        int64_t result = 0;
-        uint32_t bits_left_to_read = bit_width;
+    uint32_t read_bytes = (count * bit_width - 1) / 8 + 1;
+    uint32_t remaining_bytes = 0;
+    char* buf = nullptr;
+    input->get_buf(&buf, &remaining_bytes);
+    if (read_bytes <= remaining_bytes) {
+        uint32_t pos = 0;
+        input->get_position(&pos);
+        for (uint32_t i = 0; i < count; i++) {
+            int64_t result = 0;
+            uint32_t bits_left_to_read = bit_width;
 
-        while (bits_left_to_read > bits_left) {
-            result <<= bits_left;
-            result |= current & ((1 << bits_left) - 1);
-            bits_left_to_read -= bits_left;
-            res = input->read(&current);
-
-            if (OLAP_UNLIKELY(OLAP_SUCCESS != res)) {
-                OLAP_LOG_WARNING("fail to write byte to stream.[res=%d]", res);
-                return res;
+            while (bits_left_to_read > bits_left) {
+                result <<= bits_left;
+                result |= current & ((1 << bits_left) - 1);
+                bits_left_to_read -= bits_left;
+                current = buf[pos++];
+                bits_left = 8;
             }
 
-            bits_left = 8;
-        }
+            // handle the left over bits
+            if (bits_left_to_read > 0) {
+                result <<= bits_left_to_read;
+                bits_left -= bits_left_to_read;
+                result |= (current >> bits_left) & ((1 << bits_left_to_read) - 1);
+            }
 
-        // handle the left over bits
-        if (bits_left_to_read > 0) {
-            result <<= bits_left_to_read;
-            bits_left -= bits_left_to_read;
-            result |= (current >> bits_left) & ((1 << bits_left_to_read) - 1);
+            data[i] = result;
         }
+        input->set_position(pos);
+    } else {
+        for (uint32_t i = 0; i < count; i++) {
+            int64_t result = 0;
+            uint32_t bits_left_to_read = bit_width;
 
-        data[i] = result;
+            while (bits_left_to_read > bits_left) {
+                result <<= bits_left;
+                result |= current & ((1 << bits_left) - 1);
+                bits_left_to_read -= bits_left;
+                res = input->read(&current);
+
+                if (OLAP_UNLIKELY(OLAP_SUCCESS != res)) {
+                    OLAP_LOG_WARNING("fail to write byte to stream.[res=%d]", res);
+                    return res;
+                }
+
+                bits_left = 8;
+            }
+
+            // handle the left over bits
+            if (bits_left_to_read > 0) {
+                result <<= bits_left_to_read;
+                bits_left -= bits_left_to_read;
+                result |= (current >> bits_left) & ((1 << bits_left_to_read) - 1);
+            }
+
+            data[i] = result;
+        }
     }
-    
+
     return res;
 }
 
diff --git a/be/src/olap/column_file/serialize.h b/be/src/olap/column_file/serialize.h
index a6c469710e..c242c0bc2c 100644
--- a/be/src/olap/column_file/serialize.h
+++ b/be/src/olap/column_file/serialize.h
@@ -17,6 +17,7 @@
 #define BDG_PALO_BE_SRC_OLAP_COLUMN_FILE_SERIALIZE_H
 
 #include "olap/olap_define.h"
+#include "olap/column_file/byte_buffer.h"
 
 namespace palo {
 namespace column_file {
diff --git a/be/src/olap/column_file/stream_index_common.cpp b/be/src/olap/column_file/stream_index_common.cpp
index 302877780f..e4b2d0b426 100755
--- a/be/src/olap/column_file/stream_index_common.cpp
+++ b/be/src/olap/column_file/stream_index_common.cpp
@@ -17,6 +17,7 @@
 
 #include "olap/column_file/stream_index_common.h"
 #include "olap/field.h"
+#include "olap/wrapper_field.h"
 
 namespace palo {
 namespace column_file {
@@ -37,17 +38,14 @@ OLAPStatus ColumnStatistics::init(const FieldType& type, bool null_supported) {
     SAFE_DELETE(_minimum);
     SAFE_DELETE(_maximum);
     // 当数æ®ç±»åž‹ä¸º Stringå’Œvarchar或是未知类型时,实际上ä¸ä¼šæœ‰ç»Ÿè®¡ä¿¡æ¯ã€‚
-    _minimum = Field::create_by_type(type);
-    _maximum = Field::create_by_type(type);
+    _minimum = WrapperField::create_by_type(type);
+    _maximum = WrapperField::create_by_type(type);
 
     _null_supported = null_supported;
     if (NULL == _minimum || NULL == _maximum) {
         _ignored = true;
     } else {
         _ignored = false;
-        memset(_buf, 0, MAX_STATISTIC_LENGTH);
-        _minimum->attach_field(_buf);
-        _maximum->attach_field(_buf + _minimum->field_size());
         reset();
     }
 
@@ -64,17 +62,17 @@ void ColumnStatistics::reset() {
     }
 }
 
-void ColumnStatistics::add(const Field* field) {
+void ColumnStatistics::add(char* buf) {
     if (_ignored) {
         return;
     }
 
-    if (field->cmp(_maximum) > 0) {
-        _maximum->copy(field);
+    if (_maximum->cmp(buf) < 0) {
+        _maximum->copy(buf);
     }
 
-    if (field->cmp(_minimum) < 0) {
-        _minimum->copy(field);
+    if (_minimum->cmp(buf) > 0) {
+        _minimum->copy(buf);
     }
 }
 
@@ -127,7 +125,17 @@ OLAPStatus ColumnStatistics::write_to_buffer(char* buffer, size_t size) {
         return OLAP_ERR_BUFFER_OVERFLOW;
     }
 
-    memcpy(buffer, _buf, this->size());
+    // TODO(zc): too ugly
+    if (_null_supported) {
+        size_t cpy_size = _minimum->field_size();
+        memcpy(buffer, _minimum->get_null(), cpy_size);
+        memcpy(buffer + cpy_size, _maximum->get_null(), cpy_size);
+    } else {
+        size_t cpy_size = _minimum->size();
+        memcpy(buffer, _minimum->ptr(), cpy_size);
+        memcpy(buffer + cpy_size, _maximum->ptr(), cpy_size);
+    }
+
     return OLAP_SUCCESS;
 }
 
diff --git a/be/src/olap/column_file/stream_index_common.h b/be/src/olap/column_file/stream_index_common.h
index 19a08a387f..a1b386f617 100755
--- a/be/src/olap/column_file/stream_index_common.h
+++ b/be/src/olap/column_file/stream_index_common.h
@@ -16,10 +16,14 @@
 #ifndef BDG_PALO_BE_SRC_OLAP_COLUMN_FILE_STREAM_INDEX_COMMON_H
 #define BDG_PALO_BE_SRC_OLAP_COLUMN_FILE_STREAM_INDEX_COMMON_H
 
+#include <utility>
+
 #include "olap/field.h"
+#include "olap/wrapper_field.h"
 #include "olap/olap_define.h"
 
 namespace palo {
+
 namespace column_file {
 
 // æè¿°streamindex的格å¼
@@ -34,6 +38,9 @@ struct StreamIndexHeader {
             statistic_format(OLAP_FIELD_TYPE_NONE) {}
 } __attribute__((packed));
 
+// TODO: string type(char, varchar) has no columnar statistics at present.
+// when you want to add columnar statistics for string type,
+// don't forget to convert storage layout between disk and memory.
 // 处ç†åˆ—的统计信æ¯ï¼Œè¯»å†™ä¸€ä½“,也å¯ä»¥åˆ†å¼€ã€‚
 class ColumnStatistics {
 public:
@@ -46,11 +53,11 @@ public:
     // åªæ˜¯reset最大和最å°å€¼ï¼Œå°†æœ€å°å€¼è®¾ç½®ä¸ºMAX,将最大值设置为MIN。
     void reset();
     // 增加一个值,根æ®ä¼ å…¥å€¼è°ƒæ•´æœ€å¤§æœ€å°å€¼
-    void add(const Field* field);
+    void add(char* buf);
     // åˆå¹¶ï¼Œå°†å¦ä¸€ä¸ªç»Ÿè®¡ä¿¡æ¯å’Œå…¥å½“å‰ç»Ÿè®¡ä¸­
     void merge(ColumnStatistics* other);
-    // 返回最大最å°å€¼â€œè¾“出时â€å ç”¨çš„å†…å­˜ï¼Œè€Œâ€œä¸æ˜¯â
-    // €å½“å‰ç»“æž„å ç”¨çš„内存大å°
+    // 返回最大最å°å€¼â€œè¾“出时â€å ç”¨çš„å†…å­˜ï¼Œè€Œâ€œä¸æ˜¯?
+    // ??当å‰ç»“æž„å ç”¨çš„内存大å°
     size_t size() const;
     // 将最大最å°å€¼attach到给定的buffer上
     void attach(char* buffer);
@@ -58,19 +65,21 @@ public:
     OLAPStatus write_to_buffer(char* buffer, size_t size);
 
     // 属性
-    inline const Field* minimum() const {
+    const WrapperField* minimum() const {
         return _minimum;
     }
-    inline const Field* maximum() const {
+    const WrapperField* maximum() const {
         return _maximum;
     }
+    std::pair pair() const {
+        return std::make_pair(_minimum, _maximum);
+    }
     bool ignored() const {
         return _ignored;
     }
 protected:
-    Field* _minimum;
-    Field* _maximum;
-    char _buf[MAX_STATISTIC_LENGTH]; // field刚分é…å‡ºæ¥æ—¶æ˜¯æ²¡æœ‰å†…存的,必须注æ„,
+    WrapperField* _minimum;
+    WrapperField* _maximum;
     // ç”±äºŽæš‚æ—¶ä¸æ”¯æŒstring的统计信æ¯ï¼Œä¸ºäº†æ–¹ä¾¿ç›´æŽ¥å®šä¹‰é•¿åº¦
     // 也å¯ä»¥æ¯æ¬¡éƒ½åˆ†é…
     bool _ignored;
diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h
new file mode 100644
index 0000000000..dde759c82b
--- /dev/null
+++ b/be/src/olap/column_predicate.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_SRC_OLAP_COLUMN_PREDICATE_H
+#define BDG_PALO_BE_SRC_OLAP_COLUMN_PREDICATE_H
+
+namespace palo {
+
+class VectorizedRowBatch;
+
+class ColumnPredicate {
+public:
+    virtual ~ColumnPredicate() {}
+
+    //evaluate predicate on VectorizedRowBatch
+    virtual void evaluate(VectorizedRowBatch* batch) const = 0;
+};
+
+} //namespace palo
+
+#endif //BDG_PALO_BE_SRC_OLAP_COLUMN_PREDICATE_H
diff --git a/be/src/olap/command_executor.cpp b/be/src/olap/command_executor.cpp
index 98f3c951ff..737e8cf61e 100755
--- a/be/src/olap/command_executor.cpp
+++ b/be/src/olap/command_executor.cpp
@@ -30,7 +30,7 @@
 #include 
 #include 
 
-#include "olap/base_expansion_handler.h"
+#include "olap/base_compaction.h"
 #include "olap/delete_handler.h"
 #include "olap/field.h"
 #include "olap/olap_common.h"
@@ -43,6 +43,7 @@
 #include "olap/schema_change.h"
 #include "olap/utils.h"
 #include "util/palo_metrics.h"
+#include "util/pretty_printer.h"
 
 using apache::thrift::ThriftDebugString;
 using std::map;
@@ -126,19 +127,12 @@ OLAPStatus CommandExecutor::compute_checksum(
         OLAP_LOG_WARNING("failed to init row cursor. [res=%d]", res);
         return res;
     }
+    row.allocate_memory_for_string_type(tablet->tablet_schema());
 
-    RowCursor tmp_row;
-    res = tmp_row.init(tablet->tablet_schema(), reader_params.return_columns);
-    if (res != OLAP_SUCCESS) {
-        OLAP_LOG_WARNING("failed to init row cursor. [res=%d]", res);
-        return res;
-    }
-    
     bool eof = false;
-    int64_t raw_rows_read = 0;
-    uint32_t tmp_checksum = CRC32_INIT;
+    uint32_t row_checksum = 0;
     while (true) {
-        OLAPStatus res = reader.next_row_with_aggregation(&tmp_row, &raw_rows_read, &eof);
+        OLAPStatus res = reader.next_row_with_aggregation(&row, &eof);
         if (res == OLAP_SUCCESS && eof) {
             OLAP_LOG_DEBUG("reader reads to the end.");
             break;
@@ -147,14 +141,11 @@ OLAPStatus CommandExecutor::compute_checksum(
             return res;
         }
 
-        // reset buffer and copy from tmp_row to avoid invalid content in varchar buffer
-        row.reset_buf();
-        row.copy(tmp_row);
-        tmp_checksum = olap_crc32(tmp_checksum, row.get_buf(), row.get_buf_len());
+        row_checksum = row.hash_code(row_checksum);
     }
-    
-    OLAP_LOG_INFO("success to finish compute checksum. [checksum=%u]", tmp_checksum);
-    *checksum = tmp_checksum;
+
+    OLAP_LOG_INFO("success to finish compute checksum. [checksum=%u]", row_checksum);
+    *checksum = row_checksum;
     return OLAP_SUCCESS;
 }
 
@@ -165,13 +156,9 @@ OLAPStatus CommandExecutor::push(
     OLAP_LOG_INFO("begin to process push. [tablet_id=%ld version=%ld]",
                   request.tablet_id, request.version);
 
-    time_t start = time(NULL);
-    if (PaloMetrics::palo_push_count() != NULL) {
-        PaloMetrics::palo_push_count()->increment(1);
-    }
-
     if (tablet_info_vec == NULL) {
         OLAP_LOG_WARNING("invalid output parameter which is null pointer.");
+        PaloMetrics::push_requests_fail_total.increment(1);
         return OLAP_ERR_CE_CMD_PARAMS_ERROR;
     }
 
@@ -180,6 +167,7 @@ OLAPStatus CommandExecutor::push(
     if (NULL == olap_table.get()) {
         OLAP_LOG_WARNING("false to find table. [table=%ld schema_hash=%d]",
                          request.tablet_id, request.schema_hash);
+        PaloMetrics::push_requests_fail_total.increment(1);
         return OLAP_ERR_TABLE_NOT_FOUND;
     }
 
@@ -188,26 +176,32 @@ OLAPStatus CommandExecutor::push(
         type = PUSH_FOR_LOAD_DELETE;
     }
 
+    int64_t duration_ns = 0;
     PushHandler push_handler;
-    res = push_handler.process(olap_table, request, type, tablet_info_vec);
-
-    time_t cost = time(NULL) - start;
-    if (res != OLAP_SUCCESS) {
-        OLAP_LOG_WARNING("fail to process push. cost: %ld [res=%d table=%s]",
-                         cost, res, olap_table->full_name().c_str());
-        return res;
+    {
+        SCOPED_RAW_TIMER(&duration_ns);
+        res = push_handler.process(olap_table, request, type, tablet_info_vec);
+    }
+    if (res != OLAP_SUCCESS) {
+        LOG(WARNING) << "fail to push delta, table=" << olap_table->full_name().c_str()
+            << ",cost=" << PrettyPrinter::print(duration_ns, TUnit::TIME_NS);
+        PaloMetrics::push_requests_fail_total.increment(1);
+    } else {
+        LOG(INFO) << "success to push delta, table=" << olap_table->full_name().c_str()
+            << ",cost=" << PrettyPrinter::print(duration_ns, TUnit::TIME_NS);
+        PaloMetrics::push_requests_success_total.increment(1);
+        PaloMetrics::push_request_duration_us.increment(duration_ns / 1000);
+        PaloMetrics::push_request_write_bytes.increment(push_handler.write_bytes());
+        PaloMetrics::push_request_write_rows.increment(push_handler.write_rows());
     }
-
-    OLAP_LOG_INFO("success to finish push. cost: %ld. [table=%s]",
-            cost, olap_table->full_name().c_str());
     return res;
 }
 
-OLAPStatus CommandExecutor::base_expansion(
+OLAPStatus CommandExecutor::base_compaction(
         TTabletId tablet_id,
         TSchemaHash schema_hash,
         TVersion version) {
-    OLAP_LOG_INFO("begin to process base expansion. "
+    OLAP_LOG_INFO("begin to process base compaction. "
                   "[tablet_id=%ld schema_hash=%d version=%ld]",
                   tablet_id, schema_hash, version);
     OLAPStatus res = OLAP_SUCCESS;
@@ -220,19 +214,19 @@ OLAPStatus CommandExecutor::base_expansion(
         return OLAP_ERR_TABLE_NOT_FOUND;
     }
 
-    BaseExpansionHandler base_expansion_handler;
-    res = base_expansion_handler.init(table, true);
+    BaseCompaction base_compaction;
+    res = base_compaction.init(table, true);
     if (res != OLAP_SUCCESS) {
-        OLAP_LOG_WARNING("fail to init BaseExpansionHandler. [res=%d]", res);
+        OLAP_LOG_WARNING("fail to init BaseCompactionHandler. [res=%d]", res);
         return res;
     }
 
-    res = base_expansion_handler.run();
+    res = base_compaction.run();
     if (res != OLAP_SUCCESS) {
-        OLAP_LOG_WARNING("fail to process base_expansion. [res=%d]", res);
+        OLAP_LOG_WARNING("fail to process base_compaction. [res=%d]", res);
     }
 
-    OLAP_LOG_INFO("success to finish base expansion.");
+    OLAP_LOG_INFO("success to finish base compaction.");
     return res;
 }
 
@@ -244,9 +238,7 @@ OLAPStatus CommandExecutor::create_table(const TCreateTabletReq& request) {
     OLAP_LOG_INFO("begin to process create table. [tablet=%ld, schema_hash=%d]",
                   request.tablet_id, request.tablet_schema.schema_hash);
 
-    if (PaloMetrics::palo_request_count() != NULL) {
-        PaloMetrics::palo_request_count()->increment(1);
-    }
+    PaloMetrics::create_tablet_requests_total.increment(1);
 
     // 1. Make sure create_table operation is idempotent:
     //    return success if table with same tablet_id and schema_hash exist,
@@ -340,9 +332,7 @@ OLAPStatus CommandExecutor::drop_table(const TDropTabletReq& request) {
     OLAP_LOG_INFO("begin to process drop table. [table=%ld schema_hash=%d]",
                   request.tablet_id, request.schema_hash);
 
-    if (PaloMetrics::palo_request_count() != NULL) {
-        PaloMetrics::palo_request_count()->increment(1);
-    }
+    PaloMetrics::drop_tablet_requests_total.increment(1);
 
     OLAPStatus res = OLAPEngine::get_instance()->drop_table(
             request.tablet_id, request.schema_hash);
@@ -359,9 +349,7 @@ OLAPStatus CommandExecutor::report_all_tablets_info(
         map* tablets_info) {
     OLAP_LOG_INFO("begin to process report all tablets info.");
 
-    if (PaloMetrics::palo_request_count() != NULL) {
-        PaloMetrics::palo_request_count()->increment(1);
-    }
+    PaloMetrics::report_all_tablets_requests_total.increment(1);
 
     OLAPStatus res = OLAP_SUCCESS;
 
@@ -382,9 +370,7 @@ OLAPStatus CommandExecutor::report_tablet_info(TTabletInfo* tablet_info) {
                   "[table=%ld schema_hash=%d]",
                   tablet_info->tablet_id, tablet_info->schema_hash);
 
-    if (PaloMetrics::palo_request_count() != NULL) {
-        PaloMetrics::palo_request_count()->increment(1);
-    }
+    PaloMetrics::report_tablet_requests_total.increment(1);
 
     res = OLAPEngine::get_instance()->report_tablet_info(tablet_info);
     if (res != OLAP_SUCCESS) {
@@ -400,9 +386,7 @@ OLAPStatus CommandExecutor::schema_change(const TAlterTabletReq& request) {
     OLAP_LOG_INFO("begin to schema change. [base_table=%ld new_table=%ld]",
                   request.base_tablet_id, request.new_tablet_req.tablet_id);
 
-    if (PaloMetrics::palo_request_count() != NULL) {
-        PaloMetrics::palo_request_count()->increment(1);
-    }
+    PaloMetrics::schema_change_requests_total.increment(1);
 
     OLAPStatus res = OLAP_SUCCESS;
 
@@ -427,9 +411,7 @@ OLAPStatus CommandExecutor::create_rollup_table(const TAlterTabletReq& request)
                   "[base_table=%ld new_table=%ld]",
                   request.base_tablet_id, request.new_tablet_req.tablet_id);
 
-    if (PaloMetrics::palo_request_count() != NULL) {
-        PaloMetrics::palo_request_count()->increment(1);
-    }
+    PaloMetrics::create_rollup_requests_total.increment(1);
 
     OLAPStatus res = OLAP_SUCCESS;
 
@@ -456,10 +438,6 @@ AlterTableStatus CommandExecutor::show_alter_table_status(
                   "[table=%ld schema_hash=%d]",
                   tablet_id, schema_hash);
 
-    if (PaloMetrics::palo_request_count() != NULL) {
-        PaloMetrics::palo_request_count()->increment(1);
-    }
-
     AlterTableStatus status = ALTER_TABLE_DONE;
 
     SmartOLAPTable table = OLAPEngine::get_instance()->get_table(tablet_id, schema_hash);
@@ -605,9 +583,7 @@ OLAPStatus CommandExecutor::storage_medium_migrate(const TStorageMediumMigrateRe
                   "[tablet_id=%ld schema_hash=%d dest_storage_medium=%d]",
                   request.tablet_id, request.schema_hash, request.storage_medium);
 
-    if (PaloMetrics::palo_request_count() != NULL) {
-        PaloMetrics::palo_request_count()->increment(1);
-    }
+    PaloMetrics::storage_migrate_requests_total.increment(1);
 
     OLAPStatus res = OLAP_SUCCESS;
     res = OLAPSnapshot::get_instance()->storage_medium_migrate(
@@ -640,10 +616,7 @@ OLAPStatus CommandExecutor::delete_data(
         vector* tablet_info_vec) {
     OLAP_LOG_INFO("begin to process delete data. [request='%s']",
                   ThriftDebugString(request).c_str());
-
-    if (PaloMetrics::palo_request_count() != NULL) {
-        PaloMetrics::palo_request_count()->increment(1);
-    }
+    PaloMetrics::delete_requests_total.increment(1);
 
     OLAPStatus res = OLAP_SUCCESS;
 
@@ -680,9 +653,7 @@ OLAPStatus CommandExecutor::cancel_delete(const TCancelDeleteDataReq& request) {
     OLAP_LOG_INFO("begin to process cancel delete. [table=%ld version=%ld]",
                   request.tablet_id, request.version);
 
-    if (PaloMetrics::palo_request_count() != NULL) {
-        PaloMetrics::palo_request_count()->increment(1);
-    }
+    PaloMetrics::cancel_delete_requests_total.increment(1);
 
     OLAPStatus res = OLAP_SUCCESS;
 
diff --git a/be/src/olap/command_executor.h b/be/src/olap/command_executor.h
index 87b9a242fb..c1b00c1749 100644
--- a/be/src/olap/command_executor.h
+++ b/be/src/olap/command_executor.h
@@ -178,12 +178,12 @@ public:
     // @return OLAP_SUCCESS if cancel success
     virtual OLAPStatus cancel_delete(const TCancelDeleteDataReq& request);
 
-    // Start base expansion to expand base delta to version manually.
+    // Start base compaction to expand base delta to version manually.
     //
     // @param [in] tablet_id & schema_hash specify tablet
-    // @param [in] version specify base expansion range
+    // @param [in] version specify base compaction range
     // @return OLAP_SUCCESS if start be success
-    virtual OLAPStatus base_expansion(TTabletId tablet_id,
+    virtual OLAPStatus base_compaction(TTabletId tablet_id,
             TSchemaHash schema_hash,
             TVersion version);
 
diff --git a/be/src/olap/comparison_predicate.cpp b/be/src/olap/comparison_predicate.cpp
new file mode 100644
index 0000000000..1d72178daa
--- /dev/null
+++ b/be/src/olap/comparison_predicate.cpp
@@ -0,0 +1,151 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/comparison_predicate.h"
+#include "olap/field.h"
+#include "runtime/string_value.hpp"
+#include "runtime/vectorized_row_batch.h"
+
+namespace palo {
+
+#define COMPARISON_PRED_CONSTRUCTOR(CLASS) \
+    template<class type> \
+    CLASS<type>::CLASS(int column_id, const type& value) \
+        : _column_id(column_id), \
+          _value(value) \
+        {} \
+
+COMPARISON_PRED_CONSTRUCTOR(EqualPredicate)
+COMPARISON_PRED_CONSTRUCTOR(NotEqualPredicate)
+COMPARISON_PRED_CONSTRUCTOR(LessPredicate)
+COMPARISON_PRED_CONSTRUCTOR(LessEqualPredicate)
+COMPARISON_PRED_CONSTRUCTOR(GreaterPredicate)
+COMPARISON_PRED_CONSTRUCTOR(GreaterEqualPredicate)
+
+#define COMPARISON_PRED_CONSTRUCTOR_STRING(CLASS) \
+    template<> \
+    CLASS<StringValue>::CLASS(int column_id, const StringValue& value) \
+        : _column_id(column_id) \
+        { \
+            _value.len = value.len; \
+            _value.ptr = value.ptr; \
+        } \
+
+COMPARISON_PRED_CONSTRUCTOR_STRING(EqualPredicate)
+COMPARISON_PRED_CONSTRUCTOR_STRING(NotEqualPredicate)
+COMPARISON_PRED_CONSTRUCTOR_STRING(LessPredicate)
+COMPARISON_PRED_CONSTRUCTOR_STRING(LessEqualPredicate)
+COMPARISON_PRED_CONSTRUCTOR_STRING(GreaterPredicate)
+COMPARISON_PRED_CONSTRUCTOR_STRING(GreaterEqualPredicate)
+
+#define COMPARISON_PRED_EVALUATE(CLASS, OP) \
+    template<class type> \
+    void CLASS<type>::evaluate(VectorizedRowBatch* batch) const { \
+        uint16_t n = batch->size(); \
+        if (n == 0) { \
+            return; \
+        } \
+        uint16_t* sel = batch->selected(); \
+        const type* col_vector = reinterpret_cast<const type*>(batch->column(_column_id)->col_data()); \
+        uint16_t new_size = 0; \
+        if (batch->column(_column_id)->no_nulls()) { \
+            if (batch->selected_in_use()) { \
+                for (uint16_t j = 0; j !=n; ++j) { \
+                    uint16_t i = sel[j]; \
+                    sel[new_size] = i; \
+                    new_size += (col_vector[i] OP _value); \
+                } \
+                batch->set_size(new_size); \
+            } else { \
+                for (uint16_t i = 0; i !=n; ++i) { \
+                    sel[new_size] = i; \
+                    new_size += (col_vector[i] OP _value); \
+                } \
+                if (new_size < n) { \
+                    batch->set_size(new_size); \
+                    batch->set_selected_in_use(true); \
+                } \
+            } \
+        } else { \
+            bool* is_null = batch->column(_column_id)->is_null(); \
+            if (batch->selected_in_use()) { \
+                for (uint16_t j = 0; j !=n; ++j) { \
+                    uint16_t i = sel[j]; \
+                    sel[new_size] = i; \
+                    new_size += (!is_null[i] && (col_vector[i] OP _value)); \
+                } \
+                batch->set_size(new_size); \
+            } else { \
+                for (uint16_t i = 0; i !=n; ++i) { \
+                    sel[new_size] = i; \
+                    new_size += (!is_null[i] && (col_vector[i] OP _value)); \
+                } \
+                if (new_size < n) { \
+                    batch->set_size(new_size); \
+                    batch->set_selected_in_use(true); \
+                } \
+            } \
+        } \
+    } \
+
+
+COMPARISON_PRED_EVALUATE(EqualPredicate, ==)
+COMPARISON_PRED_EVALUATE(NotEqualPredicate, !=)
+COMPARISON_PRED_EVALUATE(LessPredicate, <)
+COMPARISON_PRED_EVALUATE(LessEqualPredicate, <=)
+COMPARISON_PRED_EVALUATE(GreaterPredicate, >)
+COMPARISON_PRED_EVALUATE(GreaterEqualPredicate, >=)
+
+#define COMPARISON_PRED_CONSTRUCTOR_DECLARATION(CLASS) \
+    template CLASS<int8_t>::CLASS(int column_id, const int8_t& value); \
+    template CLASS<int16_t>::CLASS(int column_id, const int16_t& value); \
+    template CLASS<int32_t>::CLASS(int column_id, const int32_t& value); \
+    template CLASS<int64_t>::CLASS(int column_id, const int64_t& value); \
+    template CLASS<int128_t>::CLASS(int column_id, const int128_t& value); \
+    template CLASS<float>::CLASS(int column_id, const float& value); \
+    template CLASS<double>::CLASS(int column_id, const double& value); \
+    template CLASS<decimal12_t>::CLASS(int column_id, const decimal12_t& value); \
+    template CLASS<StringValue>::CLASS(int column_id, const StringValue& value); \
+    template CLASS<uint24_t>::CLASS(int column_id, const uint24_t& value); \
+    template CLASS<uint64_t>::CLASS(int column_id, const uint64_t& value); \
+
+COMPARISON_PRED_CONSTRUCTOR_DECLARATION(EqualPredicate)
+COMPARISON_PRED_CONSTRUCTOR_DECLARATION(NotEqualPredicate)
+COMPARISON_PRED_CONSTRUCTOR_DECLARATION(LessPredicate)
+COMPARISON_PRED_CONSTRUCTOR_DECLARATION(LessEqualPredicate)
+COMPARISON_PRED_CONSTRUCTOR_DECLARATION(GreaterPredicate)
+COMPARISON_PRED_CONSTRUCTOR_DECLARATION(GreaterEqualPredicate)
+
+#define COMPARISON_PRED_EVALUATE_DECLARATION(CLASS) \
+    template void CLASS<int8_t>::evaluate(VectorizedRowBatch* batch) const; \
+    template void CLASS<int16_t>::evaluate(VectorizedRowBatch* batch) const; \
+    template void CLASS<int32_t>::evaluate(VectorizedRowBatch* batch) const; \
+    template void CLASS<int64_t>::evaluate(VectorizedRowBatch* batch) const; \
+    template void CLASS<int128_t>::evaluate(VectorizedRowBatch* batch) const; \
+    template void CLASS<float>::evaluate(VectorizedRowBatch* batch) const; \
+    template void CLASS<double>::evaluate(VectorizedRowBatch* batch) const; \
+    template void CLASS<decimal12_t>::evaluate(VectorizedRowBatch* batch) const; \
+    template void CLASS<StringValue>::evaluate(VectorizedRowBatch* batch) const; \
+    template void CLASS<uint24_t>::evaluate(VectorizedRowBatch* batch) const; \
+    template void CLASS<uint64_t>::evaluate(VectorizedRowBatch* batch) const; \
+
+COMPARISON_PRED_EVALUATE_DECLARATION(EqualPredicate)
+COMPARISON_PRED_EVALUATE_DECLARATION(NotEqualPredicate)
+COMPARISON_PRED_EVALUATE_DECLARATION(LessPredicate)
+COMPARISON_PRED_EVALUATE_DECLARATION(LessEqualPredicate)
+COMPARISON_PRED_EVALUATE_DECLARATION(GreaterPredicate)
+COMPARISON_PRED_EVALUATE_DECLARATION(GreaterEqualPredicate)
+
+} //namespace palo
diff --git a/be/src/olap/comparison_predicate.h b/be/src/olap/comparison_predicate.h
new file mode 100644
index 0000000000..2d5c10e31d
--- /dev/null
+++ b/be/src/olap/comparison_predicate.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_SRC_OLAP_COMPARISON_PREDICATE_H
+#define BDG_PALO_BE_SRC_OLAP_COMPARISON_PREDICATE_H
+
+#include <stdint.h>
+#include "olap/column_predicate.h"
+
+namespace palo {
+
+class VectorizedRowBatch;
+
+#define COMPARISON_PRED_CLASS_DEFINE(CLASS) \
+    template<class type> \
+    class CLASS : public ColumnPredicate { \
+    public: \
+        CLASS(int column_id, const type& value); \
+        virtual ~CLASS() { }  \
+        virtual void evaluate(VectorizedRowBatch* batch) const override; \
+    private: \
+        int32_t _column_id; \
+        type _value; \
+    }; \
+
+COMPARISON_PRED_CLASS_DEFINE(EqualPredicate)
+COMPARISON_PRED_CLASS_DEFINE(NotEqualPredicate)
+COMPARISON_PRED_CLASS_DEFINE(LessPredicate)
+COMPARISON_PRED_CLASS_DEFINE(LessEqualPredicate)
+COMPARISON_PRED_CLASS_DEFINE(GreaterPredicate)
+COMPARISON_PRED_CLASS_DEFINE(GreaterEqualPredicate)
+
+} //namespace palo
+
+#endif //BDG_PALO_BE_SRC_OLAP_COMPARISON_PREDICATE_H
diff --git a/be/src/olap/cumulative_handler.cpp b/be/src/olap/cumulative_compaction.cpp
similarity index 88%
rename from be/src/olap/cumulative_handler.cpp
rename to be/src/olap/cumulative_compaction.cpp
index c553af31e8..e1019ce430 100755
--- a/be/src/olap/cumulative_handler.cpp
+++ b/be/src/olap/cumulative_compaction.cpp
@@ -13,7 +13,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "olap/cumulative_handler.h"
+#include "olap/cumulative_compaction.h"
 
 #include 
 #include 
@@ -30,8 +30,8 @@ using std::vector;
 
 namespace palo {
 
-OLAPStatus CumulativeHandler::init(SmartOLAPTable table) {
-    OLAP_LOG_TRACE("init cumulative expansion handler. [table=%s]", table->full_name().c_str());
+OLAPStatus CumulativeCompaction::init(SmartOLAPTable table) {
+    OLAP_LOG_TRACE("init cumulative compaction handler. [table=%s]", table->full_name().c_str());
 
     if (_is_init) {
         OLAP_LOG_WARNING("cumulative handler has been inited.[table=%s]",
@@ -44,7 +44,7 @@ OLAPStatus CumulativeHandler::init(SmartOLAPTable table) {
     }
 
     _table = table;
-    _max_delta_file_size = config::ce_policy_max_delta_file_size;
+    _max_delta_file_size = config::cumulative_compaction_budgeted_bytes;
 
     if (!_table->try_cumulative_lock()) {
         OLAP_LOG_WARNING("another cumulative is running. [table=%s]",
@@ -79,7 +79,7 @@ OLAPStatus CumulativeHandler::init(SmartOLAPTable table) {
     _release_header_lock();
     if (res != OLAP_SUCCESS) {
         _table->release_cumulative_lock();
-        OLAP_LOG_INFO("no suitable delta versions. don't do cumulative expansion now.");
+        OLAP_LOG_INFO("no suitable delta versions. don't do cumulative compaction now.");
         return res;
     }
 
@@ -96,7 +96,7 @@ OLAPStatus CumulativeHandler::init(SmartOLAPTable table) {
     return OLAP_SUCCESS;
 }
 
-OLAPStatus CumulativeHandler::run() {
+OLAPStatus CumulativeCompaction::run() {
     if (!_is_init) {
         _table->release_cumulative_lock();
         OLAP_LOG_WARNING("cumulative handler is not inited.");
@@ -104,7 +104,7 @@ OLAPStatus CumulativeHandler::run() {
     }
 
     // 0. 准备工作
-    OLAP_LOG_INFO("start cumulative expansion [table=%s; cumulative_version=%d-%d]",
+    OLAP_LOG_INFO("start cumulative compaction [table=%s; cumulative_version=%d-%d]",
                   _table->full_name().c_str(),
                   _cumulative_version.first,
                   _cumulative_version.second);
@@ -132,13 +132,13 @@ OLAPStatus CumulativeHandler::run() {
         return OLAP_ERR_CUMULATIVE_FAILED_ACQUIRE_DATA_SOURCE;
     }
 
-    if (PaloMetrics::ce_merge_delta_num() != NULL) {
-        PaloMetrics::ce_merge_delta_num()->increment(_need_merged_versions.size());
-        int64_t merge_size = 0;
+    {
+        PaloMetrics::cumulative_compaction_deltas_total.increment(_need_merged_versions.size());
+        int64_t merge_bytes = 0;
         for (IData* i_data : _data_source) {
-            merge_size += i_data->olap_index()->data_size();
+            merge_bytes += i_data->olap_index()->data_size();
         }
-        PaloMetrics::ce_merge_size()->increment(merge_size);
+        PaloMetrics::cumulative_compaction_bytes_total.increment(merge_bytes);
     }
 
     do {
@@ -157,10 +157,10 @@ OLAPStatus CumulativeHandler::run() {
             break;
         }
 
-        // 4. 执行cumulative expansion合并过程
-        res = _do_cumulative_expansion();
+        // 4. 执行cumulative compaction合并过程
+        res = _do_cumulative_compaction();
         if (res != OLAP_SUCCESS) {
-            OLAP_LOG_WARNING("failed to do cumulative expansion. "
+            OLAP_LOG_WARNING("failed to do cumulative compaction. "
                              "[table=%s; cumulative_version=%d-%d]",
                              _table->full_name().c_str(),
                              _cumulative_version.first,
@@ -181,12 +181,12 @@ OLAPStatus CumulativeHandler::run() {
 
     _table->release_cumulative_lock();
 
-    OLAP_LOG_TRACE("elapsed time of doing cumulative expansion. [time=%ldus]",
+    OLAP_LOG_TRACE("elapsed time of doing cumulative compaction. [time=%ldus]",
             watch.get_elapse_time_us());
     return res;
 }
 
-OLAPStatus CumulativeHandler::_check_whether_satisfy_policy() {
+OLAPStatus CumulativeCompaction::_check_whether_satisfy_policy() {
     OLAPStatus res = OLAP_SUCCESS;
     
     Versions delta_versions;
@@ -196,22 +196,22 @@ OLAPStatus CumulativeHandler::_check_whether_satisfy_policy() {
         return res;
     }
     
-    if (delta_versions.size() < config::ce_policy_delta_files_number) {
+    if (delta_versions.size() < config::cumulative_compaction_num_singleton_deltas) {
         OLAP_LOG_TRACE("do not satisfy cumulative policy. "
-                       "[existed_delta_file_number=%d ce_policy_delta_file_number=%d]",
+                       "[num_existed_singleton_deltas=%d cumulative_compaction_num_singleton_deltas=%d]",
                        delta_versions.size(),
-                       config::ce_policy_delta_files_number);
+                       config::cumulative_compaction_num_singleton_deltas);
         return OLAP_ERR_CUMULATIVE_NO_SUITABLE_VERSIONS;
     }
 
     OLAP_LOG_INFO("satisfy cumulative policy."
-                  "[existed_delta_file_number=%d ce_policy_delta_file_number=%d]",
+                  "[num_existed_singleton_delta=%d cumulative_compaction_num_singleton_deltas=%d]",
                   delta_versions.size(),
-                  config::ce_policy_delta_files_number);
+                  config::cumulative_compaction_num_singleton_deltas);
     return OLAP_SUCCESS;
 }
 
-OLAPStatus CumulativeHandler::_calculate_need_merged_versions() {
+OLAPStatus CumulativeCompaction::_calculate_need_merged_versions() {
     OLAPStatus res = OLAP_SUCCESS;
     
     Versions delta_versions;
@@ -237,7 +237,7 @@ OLAPStatus CumulativeHandler::_calculate_need_merged_versions() {
         for (; index < delta_number; ++index) {
             // 如果已找到的可合并delta文件大小大于等于_max_delta_file_size,我们认为可以执行合并了
             // 停止查找过程
-            if (total_size >= _max_delta_file_size * config::cumulative_source_overflow_ratio) {
+            if (total_size >= _max_delta_file_size) {
                 break;
             }
 
@@ -292,8 +292,8 @@ OLAPStatus CumulativeHandler::_calculate_need_merged_versions() {
             return OLAP_SUCCESS;
         }
 
-        // 如果有多个可合并文件,则可以进行cumulative expansion的合并过程
-        // 如果只有只有一个可合并的文件,为了效率,不触发cumulative expansion的合并过程
+        // 如果有多个可合并文件,则可以进行cumulative compaction的合并过程
+        // 如果只有只有一个可合并的文件,为了效率,不触发cumulative compaction的合并过程
         if (need_merged_versions.size() != 1) {
             // 如果在å¯åˆå¹¶åŒºé—´å¼€å¤´ä¹‹å‰çš„ä¸€ä¸ªç‰ˆæœ¬çš„å¤§å°æ²¡æœ‰è¾¾åˆ°delta文件的最大值,
             // 则将å¯åˆå¹¶åŒºé—´çš„æ–‡ä»¶åˆå¹¶åˆ°ä¹‹å‰é‚£ä¸ªç‰ˆæœ¬ä¸Š
@@ -309,9 +309,9 @@ OLAPStatus CumulativeHandler::_calculate_need_merged_versions() {
     }
     
    // 没有找到可以合并的delta文件,无法执行合并过程,但我们仍然需要设置新的cumulative_layer_point
-    // 如果不设置新的cumulative_layer_point, 则下次执行cumulative expansion时,扫描的文件和这次
+    // 如果不设置新的cumulative_layer_point, 则下次执行cumulative compaction时,扫描的文件和这次
     // 扫描的文件相同,依然找不到可以合并的delta文件, 无法执行合并过程。
-    // 依此类推,就进入了死循环状态,永远不会进行cumulative expansion
+    // 依此类推,就进入了死循环状态,永远不会进行cumulative compaction
     _table->set_cumulative_layer_point(delta_versions[index].first);
     _table->save_header();
     return OLAP_ERR_CUMULATIVE_NO_SUITABLE_VERSIONS;
@@ -321,7 +321,7 @@ static bool version_comparator(const Version& lhs, const Version& rhs) {
     return lhs.second < rhs.second;
 }
 
-OLAPStatus CumulativeHandler::_get_delta_versions(Versions* delta_versions) {
+OLAPStatus CumulativeCompaction::_get_delta_versions(Versions* delta_versions) {
     delta_versions->clear();
     
     Versions all_versions;
@@ -353,7 +353,7 @@ OLAPStatus CumulativeHandler::_get_delta_versions(Versions* delta_versions) {
     return OLAP_SUCCESS;
 }
 
-bool CumulativeHandler::_find_previous_version(const Version current_version,
+bool CumulativeCompaction::_find_previous_version(const Version current_version,
                                                Version* previous_version) {
     Versions all_versions;
     if (OLAP_SUCCESS != _table->select_versions_to_span(Version(0, current_version.second),
@@ -390,9 +390,9 @@ bool CumulativeHandler::_find_previous_version(const Version current_version,
     return false;
 }
 
-OLAPStatus CumulativeHandler::_do_cumulative_expansion() {
+OLAPStatus CumulativeCompaction::_do_cumulative_compaction() {
     OLAPStatus res = OLAP_SUCCESS;
-    Merger merger(_table, _new_cumulative_index, READER_CUMULATIVE_EXPANSION);
+    Merger merger(_table, _new_cumulative_index, READER_CUMULATIVE_COMPACTION);
 
     // 1. merge delta files into new cumulative file
     uint64_t merged_rows = 0;
@@ -452,7 +452,7 @@ OLAPStatus CumulativeHandler::_do_cumulative_expansion() {
     // 4. validate that delete action is right
     res = _validate_delete_file_action();
     if (res != OLAP_SUCCESS) {
-        OLAP_LOG_FATAL("delete action of cumulative expansion has error. roll back."
+        OLAP_LOG_FATAL("delete action of cumulative compaction has error. roll back."
                        "[table=%s; cumulative_version=%d-%d]",
                        _table->full_name().c_str(),
                        _cumulative_version.first,
@@ -474,14 +474,14 @@ OLAPStatus CumulativeHandler::_do_cumulative_expansion() {
     // 6. delete delta files which have been merged into new cumulative file
     _delete_unused_delta_files(&unused_indices);
 
-    OLAP_LOG_INFO("succeed to do cumulative expansion. [table=%s; cumulative_version=%d-%d]",
+    OLAP_LOG_INFO("succeed to do cumulative compaction. [table=%s; cumulative_version=%d-%d]",
                   _table->full_name().c_str(),
                   _cumulative_version.first,
                   _cumulative_version.second);
     return res;
 }
 
-OLAPStatus CumulativeHandler::_update_header(vector* unused_indices) {
+OLAPStatus CumulativeCompaction::_update_header(vector* unused_indices) {
     vector new_indices;
     new_indices.push_back(_new_cumulative_index);
 
@@ -503,7 +503,7 @@ OLAPStatus CumulativeHandler::_update_header(vector* unused_indices)
     return res;    
 }
 
-void CumulativeHandler::_delete_unused_delta_files(vector* unused_indices) {
+void CumulativeCompaction::_delete_unused_delta_files(vector* unused_indices) {
     if (!unused_indices->empty()) {
         OLAPUnusedIndex* unused_index = OLAPUnusedIndex::get_instance();
 
@@ -514,7 +514,7 @@ void CumulativeHandler::_delete_unused_delta_files(vector* unused_in
     }
 }
 
-bool CumulativeHandler::_validate_need_merged_versions() {
+bool CumulativeCompaction::_validate_need_merged_versions() {
     // 1. validate versions in _need_merged_versions are continuous
     // Skip the first element
     for (unsigned int index = 1; index < _need_merged_versions.size(); ++index) {
@@ -532,7 +532,7 @@ bool CumulativeHandler::_validate_need_merged_versions() {
     return true;
 }
 
-OLAPStatus CumulativeHandler::_validate_delete_file_action() {
+OLAPStatus CumulativeCompaction::_validate_delete_file_action() {
     // 1. acquire the new cumulative version to make sure that all is right after deleting files
     Version test_version = Version(0, _cumulative_version.second);
     vector test_sources;
@@ -547,7 +547,7 @@ OLAPStatus CumulativeHandler::_validate_delete_file_action() {
     return OLAP_SUCCESS;
 }
 
-OLAPStatus CumulativeHandler::_roll_back(const vector& old_olap_indices) {
+OLAPStatus CumulativeCompaction::_roll_back(const vector& old_olap_indices) {
     vector need_remove_version;
     need_remove_version.push_back(_cumulative_version);
     // unused_indices will only contain new cumulative index
diff --git a/be/src/olap/cumulative_handler.h b/be/src/olap/cumulative_compaction.h
similarity index 85%
rename from be/src/olap/cumulative_handler.h
rename to be/src/olap/cumulative_compaction.h
index 4e2edb9e00..6e17d0d6ec 100755
--- a/be/src/olap/cumulative_handler.h
+++ b/be/src/olap/cumulative_compaction.h
@@ -13,8 +13,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#ifndef BDG_PALO_BE_SRC_OLAP_CUMULATIVE_HANDLER_H
-#define BDG_PALO_BE_SRC_OLAP_CUMULATIVE_HANDLER_H
+#ifndef BDG_PALO_BE_SRC_OLAP_CUMULATIVE_COMPACTION_H
+#define BDG_PALO_BE_SRC_OLAP_CUMULATIVE_COMPACTION_H
 
 #include 
 #include 
@@ -28,9 +28,9 @@
 
 namespace palo {
 
-class CumulativeHandler {
+class CumulativeCompaction {
 public:
-    CumulativeHandler() :
+    CumulativeCompaction() :
             _is_init(false),
             _header_locked(false),
             _old_cumulative_layer_point(0),
@@ -38,21 +38,21 @@ public:
             _max_delta_file_size(0),
             _new_cumulative_index(NULL) {}
 
-    ~CumulativeHandler() {}
+    ~CumulativeCompaction() {}
     
-    // åˆå§‹åŒ–CumulativeHandler对象,包括:
-    // - 检查是å¦è§¦å‘cumulative expansion
+    // åˆå§‹åŒ–CumulativeCompaction对象,包括:
+    // - 检查是å¦è§¦å‘cumulative compaction
     // - 计算å¯åˆå¹¶çš„delta文件
     //
     // è¾“å…¥å‚æ•°ï¼š
-    // - table 待执行cumulative expansion的olap table
+    // - table 待执行cumulative compaction的olap table
     //
     // 返回值:
-    // - 如果触å‘cumulative expansion,返回OLAP_SUCCESS
+    // - 如果触å‘cumulative compaction,返回OLAP_SUCCESS
     // - å¦åˆ™ï¼Œè¿”回对应错误ç 
     OLAPStatus init(SmartOLAPTable table);
 
-    // 执行cumulative expansion
+    // 执行cumulative compaction
     //
     // 返回值:
     // - 如果执行æˆåŠŸï¼Œè¿”å›žOLAP_SUCCESS
@@ -60,7 +60,7 @@ public:
     OLAPStatus run();
     
 private:
-    // æ£€æŸ¥æ˜¯å¦æ»¡è¶³cumulative expansion触å‘ç­–ç•¥
+    // æ£€æŸ¥æ˜¯å¦æ»¡è¶³cumulative compaction触å‘ç­–ç•¥
     //
     // 返回值:
     // - 如果满足,返回OLAP_SUCCESS
@@ -97,12 +97,12 @@ private:
     // - 如果查找失败,返回false
     bool _find_previous_version(const Version current_version, Version* previous_version);
 
-    // 执行cumulative expansionåˆå¹¶è¿‡ç¨‹
+    // 执行cumulative compactionåˆå¹¶è¿‡ç¨‹
     //
     // 返回值:
     // - 如果æˆåŠŸï¼Œè¿”å›žOLAP_SUCCESS
     // - å¦‚æžœä¸æˆåŠŸï¼Œè¿”å›žç›¸åº”é”™è¯¯ç 
-    OLAPStatus _do_cumulative_expansion();
+    OLAPStatus _do_cumulative_compaction();
 
     // å°†åˆå¹¶å¾—到的新cumulative文件载入table
     //
@@ -154,18 +154,18 @@ private:
         }
     }
 
-    // CumulativeHandler对象是å¦åˆå§‹åŒ–
+    // CumulativeCompaction对象是å¦åˆå§‹åŒ–
     bool _is_init;
     // header文件是å¦åŠ é”
     bool _header_locked;
     // table现有的cumulative层的标识点
     int32_t _old_cumulative_layer_point;
-    // å¾…cumulative expansion完æˆä¹‹åŽï¼Œæ–°çš„cumulative层的标识点
+    // å¾…cumulative compaction完æˆä¹‹åŽï¼Œæ–°çš„cumulative层的标识点
     int32_t _new_cumulative_layer_point;
     // 一个cumulative文件大å°çš„æœ€å¤§å€¼
     // 当delta文件的大å°è¶…过该值时,我们认为该delta文件是cumulative文件
     size_t _max_delta_file_size;
-    // 待执行cumulative expansion的olap table
+    // 待执行cumulative compaction的olap table
     SmartOLAPTable _table;
     // 新cumulative文件的版本
     Version _cumulative_version;
@@ -178,9 +178,9 @@ private:
     // å¯åˆå¹¶çš„delta文件的版本
     std::vector _need_merged_versions;
 
-    DISALLOW_COPY_AND_ASSIGN(CumulativeHandler);
+    DISALLOW_COPY_AND_ASSIGN(CumulativeCompaction);
 };
 
 }  // namespace palo
 
-#endif // BDG_PALO_BE_SRC_OLAP_CUMULATIVE_HANDLER_H
+#endif // BDG_PALO_BE_SRC_OLAP_CUMULATIVE_COMPACTION_H
diff --git a/be/src/olap/field.cpp b/be/src/olap/field.cpp
index a57d67f67e..96eb7409e9 100644
--- a/be/src/olap/field.cpp
+++ b/be/src/olap/field.cpp
@@ -23,1066 +23,51 @@
 using std::map;
 using std::nothrow;
 using std::string;
-using std::stringstream;
 
 namespace palo {
 
-FieldType FieldInfo::get_field_type_by_string(const string& type_str) {
-    string upper_type_str = type_str;
-    std::transform(type_str.begin(), type_str.end(), upper_type_str.begin(), toupper);
-    FieldType type;
-
-    if (0 == upper_type_str.compare("TINYINT")) {
-        type = OLAP_FIELD_TYPE_TINYINT;
-    } else if (0 == upper_type_str.compare("SMALLINT")) {
-        type = OLAP_FIELD_TYPE_SMALLINT;
-    } else if (0 == upper_type_str.compare("INT")) {
-        type = OLAP_FIELD_TYPE_INT;
-    } else if (0 == upper_type_str.compare("BIGINT")) {
-        type = OLAP_FIELD_TYPE_BIGINT;
-    } else if (0 == upper_type_str.compare("LARGEINT")) {
-        type = OLAP_FIELD_TYPE_LARGEINT;
-    } else if (0 == upper_type_str.compare("UNSIGNED_TINYINT")) {
-        type = OLAP_FIELD_TYPE_UNSIGNED_TINYINT;
-    } else if (0 == upper_type_str.compare("UNSIGNED_SMALLINT")) {
-        type = OLAP_FIELD_TYPE_UNSIGNED_SMALLINT;
-    } else if (0 == upper_type_str.compare("UNSIGNED_INT")) {
-        type = OLAP_FIELD_TYPE_UNSIGNED_INT;
-    } else if (0 == upper_type_str.compare("UNSIGNED_BIGINT")) {
-        type = OLAP_FIELD_TYPE_UNSIGNED_BIGINT;
-    } else if (0 == upper_type_str.compare("FLOAT")) {
-        type = OLAP_FIELD_TYPE_FLOAT;
-    } else if (0 == upper_type_str.compare("DISCRETE_DOUBLE")) {
-        type = OLAP_FIELD_TYPE_DISCRETE_DOUBLE;
-    } else if (0 == upper_type_str.compare("DOUBLE")) {
-        type = OLAP_FIELD_TYPE_DOUBLE;
-    } else if (0 == upper_type_str.compare("CHAR")) {
-        type = OLAP_FIELD_TYPE_CHAR;
-    } else if (0 == upper_type_str.compare("DATE")) {
-        type = OLAP_FIELD_TYPE_DATE;
-    } else if (0 == upper_type_str.compare("DATETIME")) {
-        type = OLAP_FIELD_TYPE_DATETIME;
-    } else if (0 == upper_type_str.compare(0, 7, "DECIMAL")) {
-        type = OLAP_FIELD_TYPE_DECIMAL;
-    } else if (0 == upper_type_str.compare(0, 7, "VARCHAR")) {
-        type = OLAP_FIELD_TYPE_VARCHAR;
-    } else if (0 == upper_type_str.compare(0, 3, "HLL")) {
-        type = OLAP_FIELD_TYPE_HLL;
-    } else if (0 == upper_type_str.compare("STRUCT")) {
-        type = OLAP_FIELD_TYPE_STRUCT;
-    } else if (0 == upper_type_str.compare("LIST")) {
-        type = OLAP_FIELD_TYPE_LIST;
-    } else if (0 == upper_type_str.compare("MAP")) {
-        type = OLAP_FIELD_TYPE_MAP;
-    } else {
-        OLAP_LOG_WARNING("invalid type string. [type='%s']", type_str.c_str());
-        type = OLAP_FIELD_TYPE_UNKNOWN;
-    }
-
-    return type;
-}
-
-FieldAggregationMethod FieldInfo::get_aggregation_type_by_string(const string& str) {
-    string upper_str = str;
-    std::transform(str.begin(), str.end(), upper_str.begin(), toupper);
-    FieldAggregationMethod aggregation_type;
-
-    if (0 == upper_str.compare("NONE")) {
-        aggregation_type = OLAP_FIELD_AGGREGATION_NONE;
-    } else if (0 == upper_str.compare("SUM")) {
-        aggregation_type = OLAP_FIELD_AGGREGATION_SUM;
-    } else if (0 == upper_str.compare("MIN")) {
-        aggregation_type = OLAP_FIELD_AGGREGATION_MIN;
-    } else if (0 == upper_str.compare("MAX")) {
-        aggregation_type = OLAP_FIELD_AGGREGATION_MAX;
-    } else if (0 == upper_str.compare("REPLACE")) {
-        aggregation_type = OLAP_FIELD_AGGREGATION_REPLACE;
-    } else if (0 == upper_str.compare("HLL_UNION")) {
-        aggregation_type = OLAP_FIELD_AGGREGATION_HLL_UNION;
-    } else {
-        OLAP_LOG_WARNING("invalid aggregation type string. [aggregation='%s']", str.c_str());
-        aggregation_type = OLAP_FIELD_AGGREGATION_UNKNOWN;
-    }
-
-    return aggregation_type;
-}
-
-string FieldInfo::get_string_by_field_type(FieldType type) {
-    switch (type) {
-    case OLAP_FIELD_TYPE_TINYINT:
-        return "TINYINT";
-
-    case OLAP_FIELD_TYPE_UNSIGNED_TINYINT:
-        return "UNSIGNED_TINYINT";
-
-    case OLAP_FIELD_TYPE_SMALLINT:
-        return "SMALLINT";
-
-    case OLAP_FIELD_TYPE_UNSIGNED_SMALLINT:
-        return "UNSIGNED_SMALLINT";
-
-    case OLAP_FIELD_TYPE_INT:
-        return "INT";
-
-    case OLAP_FIELD_TYPE_UNSIGNED_INT:
-        return "UNSIGNED_INT";
-
-    case OLAP_FIELD_TYPE_BIGINT:
-        return "BIGINT";
-
-    case OLAP_FIELD_TYPE_LARGEINT:
-        return "LARGEINT";
-
-    case OLAP_FIELD_TYPE_UNSIGNED_BIGINT:
-        return "UNSIGNED_BIGINT";
-
-    case OLAP_FIELD_TYPE_FLOAT:
-        return "FLOAT";
-
-    case OLAP_FIELD_TYPE_DOUBLE:
-        return "DOUBLE";
-
-    case OLAP_FIELD_TYPE_DISCRETE_DOUBLE:
-        return "DISCRETE_DOUBLE";
-
-    case OLAP_FIELD_TYPE_CHAR:
-        return "CHAR";
-
-    case OLAP_FIELD_TYPE_DATE:
-        return "DATE";
-
-    case OLAP_FIELD_TYPE_DATETIME:
-        return "DATETIME";
-
-    case OLAP_FIELD_TYPE_DECIMAL:
-        return "DECIMAL";
-
-    case OLAP_FIELD_TYPE_VARCHAR:
-        return "VARCHAR";
-    
-    case OLAP_FIELD_TYPE_HLL:
-        return "HLL";
-
-    case OLAP_FIELD_TYPE_STRUCT:
-        return "STRUCT";
-
-    case OLAP_FIELD_TYPE_LIST:
-        return "LIST";
-
-    case OLAP_FIELD_TYPE_MAP:
-        return "MAP";
-
-    default:
-        return "UNKNOWN";
-    }
-}
-
-string FieldInfo::get_string_by_aggregation_type(FieldAggregationMethod type) {
-    switch (type) {
-    case OLAP_FIELD_AGGREGATION_NONE:
-        return "NONE";
-
-    case OLAP_FIELD_AGGREGATION_SUM:
-        return "SUM";
-
-    case OLAP_FIELD_AGGREGATION_MIN:
-        return "MIN";
-
-    case OLAP_FIELD_AGGREGATION_MAX:
-        return "MAX";
-
-    case OLAP_FIELD_AGGREGATION_REPLACE:
-        return "REPLACE";
-            
-    case OLAP_FIELD_AGGREGATION_HLL_UNION:
-        return "HLL_UNION";
-
-    default:
-        return "UNKNOWN";
-    }
-}
-
-uint32_t FieldInfo::get_field_length_by_type(TPrimitiveType::type type, uint32_t string_length) {
-    switch (type) {
-    case TPrimitiveType::TINYINT:
-        return 1;
-    case TPrimitiveType::SMALLINT:
-        return 2;
-    case TPrimitiveType::INT:
-        return 4;
-    case TPrimitiveType::BIGINT:
-        return 8;
-    case TPrimitiveType::LARGEINT:
-        return 16;
-    case TPrimitiveType::DATE:
-        return 3;
-    case TPrimitiveType::DATETIME:
-        return 8;
-    case TPrimitiveType::FLOAT:
-        return 4;
-    case TPrimitiveType::DOUBLE:
-        return 8;
-    case TPrimitiveType::CHAR:
-        return string_length;
-    case TPrimitiveType::VARCHAR:
-    case TPrimitiveType::HLL:
-        return string_length + sizeof(VarCharField::LengthValueType);
-    case TPrimitiveType::DECIMAL:    
-        return 12; // use 12 bytes in olap engine.
-    default:
-        OLAP_LOG_WARNING("unknown field type. [type=%d]", type);
-        return 0;
-    }
-}
-
 Field* Field::create(const FieldInfo& field_info) {
-    Field* field = NULL;
-
-    switch (field_info.type) {
-    case OLAP_FIELD_TYPE_TINYINT:
-        field = new(nothrow) NumericField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_SMALLINT:
-        field = new(nothrow) NumericField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_INT:
-        field = new(nothrow) NumericField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_BIGINT:
-        field = new(nothrow) NumericField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_LARGEINT:
-        field = new(nothrow) NumericField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_UNSIGNED_TINYINT:
-        field = new(nothrow) NumericField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_UNSIGNED_SMALLINT:
-        field = new(nothrow) NumericField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_UNSIGNED_INT:
-        field = new(nothrow) NumericField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_UNSIGNED_BIGINT:
-        field = new(nothrow) NumericField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_FLOAT:
-        field = new(nothrow) NumericField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_DISCRETE_DOUBLE:
-        field = new(nothrow) DiscreteDoubleField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_DOUBLE:
-        field = new(nothrow) NumericField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_CHAR:
-        field = new(nothrow) CharField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_DATE:
-        field = new(nothrow) DateField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_DATETIME:
-        field = new(nothrow) DateTimeField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_DECIMAL:
-        field = new(nothrow) DecimalField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_VARCHAR:
-        field = new(nothrow) VarCharField(field_info);
-        break;
-    case OLAP_FIELD_TYPE_HLL:
-        field = new(nothrow) HllField(field_info);
-        break;
-    case OLAP_FIELD_TYPE_STRUCT:
-    case OLAP_FIELD_TYPE_LIST:
-    case OLAP_FIELD_TYPE_MAP:
-        OLAP_LOG_WARNING("unsupported field type. [type='%s']",
-                         FieldInfo::get_string_by_field_type(field_info.type).c_str());
-        return NULL;
-
-    case OLAP_FIELD_TYPE_UNKNOWN:
-    default:
-        OLAP_LOG_WARNING("unknown field type. [type=%d]", field_info.type);
-        return NULL;
-    }
-
-    if (NULL == field) {
-        OLAP_LOG_WARNING("fail to malloc Field.");
-        return NULL;
-    }
-
-    field->_field_type = field_info.type;
-
-    if (field->init() != OLAP_SUCCESS) {
-        OLAP_LOG_WARNING("fail to init Field.");
-        SAFE_DELETE(field);
-        return NULL;
-    }
-
+    Field* field = new Field(field_info);
     return field;
 }
 
 // 这个函数目å‰ä¸æ”¯æŒå­—符串类型
-Field* Field::create_by_type(const FieldType& field_type) {
+Field* Field::create_by_type(const FieldType& type) {
     Field* field = NULL;
     FieldInfo field_info;
     field_info.aggregation = OLAP_FIELD_AGGREGATION_NONE;
+    field_info.type = type;
 
-    switch (field_type) {
-    case OLAP_FIELD_TYPE_TINYINT:
-        field = new(nothrow) NumericField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_SMALLINT:
-        field = new(nothrow) NumericField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_INT:
-        field = new(nothrow) NumericField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_BIGINT:
-        field = new(nothrow) NumericField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_LARGEINT:
-        field = new(nothrow) NumericField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_UNSIGNED_TINYINT:
-        field = new(nothrow) NumericField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_UNSIGNED_SMALLINT:
-        field = new(nothrow) NumericField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_UNSIGNED_INT:
-        field = new(nothrow) NumericField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_UNSIGNED_BIGINT:
-        field = new(nothrow) NumericField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_FLOAT:
-        field = new(nothrow) NumericField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_DISCRETE_DOUBLE:
-        field = new(nothrow) DiscreteDoubleField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_DOUBLE:
-        field = new(nothrow) NumericField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_DATE:
-        field = new(nothrow) DateField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_DATETIME:
-        field = new(nothrow) DateTimeField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_DECIMAL:
-        field = new(nothrow) DecimalField(field_info);
-        break;
-
-    case OLAP_FIELD_TYPE_STRUCT:
-    case OLAP_FIELD_TYPE_LIST:
-    case OLAP_FIELD_TYPE_MAP:
-        OLAP_LOG_DEBUG("not supported field type. [type=%d]", field_type);
-        return NULL;
-
-    case OLAP_FIELD_TYPE_CHAR:
-    case OLAP_FIELD_TYPE_VARCHAR:
-    case OLAP_FIELD_TYPE_HLL:
-        return NULL;
-
-    case OLAP_FIELD_TYPE_UNKNOWN:
-    default:
-        OLAP_LOG_DEBUG("not supported field type. [type=%d]", field_type);
-        return NULL;
-    }
-
-    if (NULL == field) {
-        OLAP_LOG_WARNING("fail to malloc Field.");
-        return NULL;
-    }
-
-    field->_field_type = field_type;
-
-    if (field->init() != OLAP_SUCCESS) {
-        OLAP_LOG_WARNING("fail to init Field.");
-        SAFE_DELETE(field);
-        return NULL;
+    if (type == OLAP_FIELD_TYPE_CHAR || type == OLAP_FIELD_TYPE_VARCHAR
+            || type == OLAP_FIELD_TYPE_HLL) {
+        field = NULL;
+    } else {
+        field = new Field(field_info);
     }
 
     return field;
 }
 
-template 
-OLAPStatus BaseField::init() {
-    switch (_aggregation) {
-    case OLAP_FIELD_AGGREGATION_SUM:
-        _aggregator = new(nothrow) FieldAddAggregator();
-        break;
+Field::Field(const FieldInfo& field_info)
+        : _type(field_info.type),
+          _index_size(field_info.index_length),
+          _offset(0) {
 
-    case OLAP_FIELD_AGGREGATION_MIN:
-        _aggregator = new(nothrow) FieldMinAggregator();
-        break;
-
-    case OLAP_FIELD_AGGREGATION_MAX:
-        _aggregator = new(nothrow) FieldMaxAggregator();
-        break;
-
-    case OLAP_FIELD_AGGREGATION_REPLACE:
-        _aggregator = new(nothrow) FieldReplaceAggregator();
-        break;
-
-    case OLAP_FIELD_AGGREGATION_NONE:
-        _aggregator = new(nothrow) FieldNoneAggregator();
-        break;
-    case OLAP_FIELD_AGGREGATION_HLL_UNION:
-        _aggregator = new(nothrow) FieldHllUnionAggreator();
-        break;
-    case OLAP_FIELD_AGGREGATION_UNKNOWN:
-    default:
-        OLAP_LOG_WARNING("unknown aggregation method, use FieldAddAggregator for default."
-                         " [aggregation=%d]", _aggregation);
-        return OLAP_ERR_OTHER_ERROR;
-    }
-
-    if (NULL == _aggregator) {
-        OLAP_LOG_WARNING("fail to malloc aggregator.");
-        return OLAP_ERR_MALLOC_ERROR;
-    }
-
-    return OLAP_SUCCESS;
-}
-
-// 䏋颿˜¯ä¸€äº›ç³»åˆ—类型转æ¢å‡½æ•°
-// æ‰€æœ‰çš„å‡½æ•°å‡æ²¡æœ‰è€ƒè™‘对异常情况的处ç†
-// 对于strto*å’Œato*系列函数,除éžä¼ å…¥çš„䏿˜¯åˆæ³•çš„C风格字符串
-// 例如是NULL或者没有以\0结尾,å¦åˆ™ä¸ä¼šå¯¼è‡´ä¸¥é‡é”™è¯¯çš„å‘生
-template <>
-OLAPStatus BaseField::from_string(const std::string& value_string) {
-    int8_t value = 0;
-
-    if (value_string.length() > 0) {
-        value = static_cast(strtol(value_string.c_str(), NULL, 10));
-    }
-
-    from_storage(reinterpret_cast(&value));
-
-    return OLAP_SUCCESS;
-}
-
-template <>
-OLAPStatus BaseField::from_string(const std::string& value_string) {
-    int16_t value = 0;
-
-    if (value_string.length() > 0) {
-        value = static_cast(strtol(value_string.c_str(), NULL, 10));
-    }
-
-    from_storage(reinterpret_cast(&value));
-
-    return OLAP_SUCCESS;
-}
-
-template <>
-OLAPStatus BaseField::from_string(const std::string& value_string) {
-    int value = 0;
-
-    if (value_string.length() > 0) {
-        value = static_cast(strtol(value_string.c_str(), NULL, 10));
-    }
-
-    from_storage(reinterpret_cast(&value));
-
-    return OLAP_SUCCESS;
-}
-
-template <>
-OLAPStatus BaseField::from_string(const std::string& value_string) {
-    int64_t value = 0;
-
-    if (value_string.length() > 0) {
-        value = strtol(value_string.c_str(), NULL, 10);
-    }
-
-    from_storage(reinterpret_cast(&value));
-
-    return OLAP_SUCCESS;
-}
-
-template <>
-OLAPStatus BaseField::from_string(const std::string& str) {
-    int128_t value = 0;
-
-    const char* value_string = str.c_str();
-    char* end = NULL;
-    value = strtol(value_string, &end, 10);
-    if (*end != 0) {
-        value = 0;
-    } else if (value > LONG_MIN && value < LONG_MAX) {
-        // use strtol result directly
+    _type_info = get_type_info(field_info.type);
+    if (_type == OLAP_FIELD_TYPE_CHAR || _type == OLAP_FIELD_TYPE_VARCHAR
+            || _type == OLAP_FIELD_TYPE_HLL) {
+        _size = sizeof(StringSlice);
     } else {
-        bool is_negative = false;
-        if (*value_string == '-' || *value_string == '+') {
-            if (*(value_string++) == '-') {
-                is_negative = true;
-            }
-        }
-
-        uint128_t current = 0;
-        uint128_t max_int128 = ~((int128_t)(1) << 127);
-        while (*value_string != 0) {
-            if (current > max_int128 / 10) {
-                break;
-            }
-
-            current = current * 10 + (*(value_string++) - '0');
-        }
-
-        if (*value_string != 0
-                || (!is_negative && current > max_int128)
-                || ( is_negative&& current > max_int128 + 1)) {
-            current = 0;
-        }
-
-        value = is_negative ? -current : current;
+        /*
+         * the field_info.size and field_info.index_length is equal to zero,
+         * if field_info is generated by Field::create_by_type function.
+         * ColumnStatistics use size but not index_size.
+         */
+        _size = _type_info->size();
     }
-
-    from_storage(reinterpret_cast(&value));
-
-    return OLAP_SUCCESS;
+    _index_size = field_info.index_length;
+    _aggregate_func = get_aggregate_func(field_info.aggregation, field_info.type);
+    _finalize_func = get_finalize_func(field_info.aggregation, field_info.type);
 }
 
-template <>
-OLAPStatus BaseField::from_string(const std::string& value_string) {
-    uint32_t value = 0;
-
-    if (value_string.length() > 0) {
-        value = static_cast(strtoul(value_string.c_str(), NULL, 10));
-    }
-
-    from_storage(reinterpret_cast(&value));
-
-    return OLAP_SUCCESS;
-}
-
-template <>
-OLAPStatus BaseField::from_string(const std::string& value_string) {
-    uint64_t value = 0;
-
-    if (value_string.length() > 0) {
-        value = strtoul(value_string.c_str(), NULL, 10);
-    }
-
-    from_storage(reinterpret_cast(&value));
-
-    return OLAP_SUCCESS;
-}
-
-template <>
-OLAPStatus BaseField::from_string(const std::string& value_string) {
-    uint16_t value = 0;
-
-    if (value_string.length() > 0) {
-        value = static_cast(strtoul(value_string.c_str(), NULL, 10));
-    }
-
-    from_storage(reinterpret_cast(&value));
-
-    return OLAP_SUCCESS;
-}
-
-template <>
-OLAPStatus BaseField::from_string(const std::string& value_string) {
-    uint8_t value = 0;
-
-    if (value_string.length() > 0) {
-        value = static_cast(strtoul(value_string.c_str(), NULL, 10));
-    }
-
-    from_storage(reinterpret_cast(&value));
-
-    return OLAP_SUCCESS;
-}
-
-template <>
-OLAPStatus BaseField::from_string(const std::string& value_string) {
-    float value = 0.0f;
-
-    if (value_string.length() > 0) {
-        value = static_cast(atof(value_string.c_str()));
-    }
-
-    from_storage(reinterpret_cast(&value));
-
-    return OLAP_SUCCESS;
-}
-
-template <>
-OLAPStatus BaseField::from_string(const std::string& value_string) {
-    double value = 0.0;
-
-    if (value_string.length() > 0) {
-        value = atof(value_string.c_str());
-    }
-
-    from_storage(reinterpret_cast(&value));
-
-    return OLAP_SUCCESS;
-}
-
-template 
-string BaseField::to_string() const {
-    stringstream stream;
-    stream << *_value();
-    return stream.str();
-}
-
-template 
-string BaseField::to_buf() const {
-    stringstream stream;
-    stream << *_value();
-    return stream.str();
-}
-
-template <>
-string BaseField::to_string() const {
-    char buf[1024] = {'\0'};
-    snprintf(buf, sizeof(buf), "%u", *_value());
-    return string(buf);
-}
-
-template <>
-string BaseField::to_buf() const {
-    char buf[1024] = {'\0'};
-    snprintf(buf, sizeof(buf), "%u", *_value());
-    return string(buf);
-}
-
-template <>
-string BaseField::to_string() const {
-    char buf[1024] = {'\0'};
-    snprintf(buf, sizeof(buf), "%d", *_value());
-    return string(buf);
-}
-
-template <>
-string BaseField::to_buf() const {
-    char buf[1024] = {'\0'};
-    snprintf(buf, sizeof(buf), "%d", *_value());
-    return string(buf);
-}
-
-template <>
-string BaseField::to_string() const {
-    char buf[1024] = {'\0'};
-    snprintf(buf, sizeof(buf), "%.10f", *_value());
-    return string(buf);
-}
-
-template <>
-string BaseField::to_buf() const {
-    char buf[1024] = {'\0'};
-    snprintf(buf, sizeof(buf), "%.10f", *_value());
-    return string(buf);
-}
-
-template <>
-string BaseField::to_string() const {
-    decimal12_t* data_ptr = _value();
-    return data_ptr->to_string();
-}
-
-template <>
-string BaseField::to_buf() const {
-    decimal12_t* data_ptr = _value();
-    return data_ptr->to_string();
-}
-
-template <>
-string BaseField::to_string() const {
-    char buf[1024];
-    int128_t value = *_value();
-    if (value >= std::numeric_limits::min()
-            && value <= std::numeric_limits::max()) {
-        snprintf(buf, sizeof(buf), "%ld", (int64_t)value);
-    } else {
-        char* current = buf;
-        uint128_t abs_value = value;
-        if (value < 0) {
-            *(current++) = '-';
-            abs_value = -value;
-        }
-
-        // the max value of uint64_t is 18446744073709551615UL,
-        // so use Z19_UINT64 to divide uint128_t 
-        const static uint64_t Z19_UINT64 = 10000000000000000000ULL;
-        uint64_t suffix = abs_value % Z19_UINT64;
-        uint64_t middle = abs_value / Z19_UINT64 % Z19_UINT64;
-        uint64_t prefix = abs_value / Z19_UINT64 / Z19_UINT64;
-
-        char* end = buf + sizeof(buf);
-        if (prefix > 0) {
-            current += snprintf(current, end - current, "%" PRIu64, prefix);
-            current += snprintf(current, end - current, "%.19" PRIu64, middle);
-            current += snprintf(current, end - current, "%.19" PRIu64, suffix);
-        } else if (OLAP_LIKELY(middle > 0)) {
-            current += snprintf(current, end - current, "%" PRIu64, middle);
-            current += snprintf(current, end - current, "%.19" PRIu64, suffix);
-        } else {
-            current += snprintf(current, end - current, "%" PRIu64, suffix);
-        }
-    }
-
-    return std::string(buf);
-}
-
-template <>
-string BaseField::to_buf() const {
-    return to_string();
-}
-
-template <>
-void NumericField::set_to_min() {
-    int128_t min_value = (int128_t)(1) << 127;
-    memcpy(this->_buf, reinterpret_cast(&min_value), sizeof(int128_t));
-}
-
-template<>
-bool NumericField::is_min() {
-    int128_t min_value = (int128_t)(1) << 127;
-    if (*_value() == min_value) {
-        return true;
-    } else {
-        return false;
-    }
-}
-
-template <>
-void NumericField::set_to_max() {
-    int128_t max_value = ~((int128_t)(1) << 127);
-    memcpy(this->_buf, reinterpret_cast(&max_value), sizeof(int128_t));
-}
-
-OLAPStatus CharField::init() {
-    OLAPStatus res = BaseField::init();
-
-    return res;
-}
-
-OLAPStatus VarCharField::from_string(const std::string& value_string) {
-    size_t value_len = value_string.length();
-
-    if (value_len > (_buf_size - sizeof(LengthValueType))) {
-        OLAP_LOG_WARNING("the len of value string is too log[len=%lu, buf_size=%lu].",
-                value_len, _buf_size);
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-
-    if (_buf == NULL) {
-        OLAP_LOG_WARNING("the buf is NULL!");
-        return OLAP_ERR_NOT_INITED;
-    }
-    
-    memcpy(_buf + sizeof(LengthValueType), value_string.c_str(), value_len);
-    *_length_ptr = value_len;
-
-    return OLAP_SUCCESS;
-}
-
-string VarCharField::to_string() const {
-    if (NULL == _buf) {
-        return "";
-    }
-    
-    string res(reinterpret_cast(_buf + sizeof(LengthValueType)), *_length_ptr);
-    
-    return res;
-}
-
-string VarCharField::to_buf() const {
-    if (NULL == _buf) {
-        return "";
-    }
-
-    string res(reinterpret_cast(_buf + sizeof(LengthValueType)), *_length_ptr);
-    return res;
-}
-
-OLAPStatus CharField::from_string(const std::string& value_string) {
-    if (_length == 0) {
-        return OLAP_ERR_NOT_INITED;
-    }
-
-    size_t value_len = value_string.length();
-    
-    if (value_len > _length) {
-        OLAP_LOG_WARNING("the len of value string is too long[len=%lu, m_aclloate_len=%lu].",
-                value_len, _length);
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-
-    if (_buf != NULL) {
-        memset(_buf, 0, _length);
-        memcpy((void*)buf(), (const void*)value_string.c_str(), value_len);
-    }
-
-    return OLAP_SUCCESS;
-}
-
-string CharField::to_string() const {
-    if (NULL == _buf || _length == 0) {
-        return "";
-    }
-
-    string res(_buf, _length);
-    return res;
-}
-
-string CharField::to_buf() const {
-    if (NULL == _buf || _length == 0) {
-        return "";
-    }
-
-    string res(reinterpret_cast(_buf), _length);
-    return res;
-}
-    
-void HllSetResolver::parse() {
-    // skip LengthValueType
-    char*  pdata = _buf_ref;
-    _set_type = (HllDataType)pdata[0];
-    char* sparse_data = NULL;
-    switch (_set_type) {
-        case HLL_DATA_EXPLICIT:
-            // first byte : type
-            // second~five byte : hash values's number
-            // five byte later : hash value
-            _expliclit_num = (ExpliclitLengthValueType) (pdata[sizeof(SetTypeValueType)]);
-            _expliclit_value = (uint64_t*)(pdata + sizeof(SetTypeValueType) 
-                                                 + sizeof(ExpliclitLengthValueType));
-            break;
-        case HLL_DATA_SPRASE:
-            // first byte : type
-            // second ~(2^HLL_COLUMN_PRECISION)/8 byte : bitmap mark which is not zero
-            // 2^HLL_COLUMN_PRECISION)/8 + 1以åŽvalue
-            _sparse_count = (SparseLengthValueType*)(pdata + sizeof (SetTypeValueType));
-            sparse_data = pdata + sizeof(SetTypeValueType) + sizeof(SparseLengthValueType);
-            for (int i = 0; i < *_sparse_count; i++) {
-                SparseIndexType* index = (SparseIndexType*)sparse_data;
-                sparse_data += sizeof(SparseIndexType);
-                SparseValueType* value = (SparseValueType*)sparse_data;
-                _sparse_map[*index] = *value;
-                sparse_data += sizeof(SetTypeValueType);
-            }
-            break;
-        case HLL_DATA_FULL:
-            // first byte : type
-            // second byte later : hll register value
-            _full_value_position = pdata + sizeof (SetTypeValueType);
-            break;
-        default:
-            // HLL_DATA_EMPTY
-            break;
-    }
-}
-
-void HllSetResolver::fill_registers(char* registers, int len) {
-
-    if (_set_type == HLL_DATA_EXPLICIT) {
-        for (int i = 0; i < get_expliclit_count(); ++i) {
-            uint64_t hash_value = get_expliclit_value(i);
-            int idx = hash_value % len;
-            uint8_t first_one_bit = __builtin_ctzl(hash_value >> HLL_COLUMN_PRECISION) + 1;
-            registers[idx] = std::max((uint8_t)registers[idx], first_one_bit);
-        }
-    } else if (_set_type == HLL_DATA_SPRASE) {
-        std::map& sparse_map = get_sparse_map();
-        for (std::map::iterator iter = sparse_map.begin();
-                                                 iter != sparse_map.end(); iter++) {
-            registers[iter->first] = std::max((uint8_t)registers[iter->first], (uint8_t)iter->second);
-        }
-    } else if (_set_type == HLL_DATA_FULL) {
-        char* full_value = get_full_value();
-        for (int i = 0; i < len; i++) {
-            registers[i] = std::max((uint8_t)registers[i], (uint8_t)full_value[i]);
-        }
-        
-    } else {
-      // HLL_DATA_EMPTY
-    }
-}
-
-void HllSetResolver::fill_index_to_value_map(std::map* index_to_value, int len) {
-
-    if (_set_type == HLL_DATA_EXPLICIT) {
-        for (int i = 0; i < get_expliclit_count(); ++i) {
-            uint64_t hash_value = get_expliclit_value(i);
-            int idx = hash_value % len; 
-            uint8_t first_one_bit = __builtin_ctzl(hash_value >> HLL_COLUMN_PRECISION) + 1; 
-            if (index_to_value->find(idx) != index_to_value->end()) {
-                (*index_to_value)[idx] = (*index_to_value)[idx] < first_one_bit ? first_one_bit : (*index_to_value)[idx];
-            } else {
-                (*index_to_value)[idx] = first_one_bit;
-            }    
-        }    
-    } else if (_set_type == HLL_DATA_SPRASE) {
-        std::map& sparse_map = get_sparse_map();
-        for (std::map::iterator iter = sparse_map.begin();
-                                                 iter != sparse_map.end(); iter++) {
-            if (index_to_value->find(iter->first) != index_to_value->end()) {
-                (*index_to_value)[iter->first] = (*index_to_value)[iter->first] < iter->second ? iter->second : (*index_to_value)[iter->first];
-            } else {
-                (*index_to_value)[iter->first] = iter->second;
-            }        
-        }        
-    } else if (_set_type == HLL_DATA_FULL) {
-       char* registers = get_full_value();
-       for (int i = 0; i < len; i++) {
-           if (registers[i] != 0) {
-               if (index_to_value->find(i) != index_to_value->end()) {
-                   (*index_to_value)[i] = (*index_to_value)[i] < registers[i] ? registers[i]  : (*index_to_value)[i];
-               } else {
-                   (*index_to_value)[i] = registers[i];
-               } 
-           }
-       } 
-    } else { 
-      // HLL_DATA_EMPTY
-    }      
-}
-
-void HllSetResolver::fill_hash64_set(std::set* hash_set) {
-    if (_set_type == HLL_DATA_EXPLICIT) {
-        for (int i = 0; i < get_expliclit_count(); ++i) {
-            uint64_t hash_value = get_expliclit_value(i);
-            hash_set->insert(hash_value);
-        }    
-    } 
-}
- 
-void HllSetHelper::set_sparse(char *result, const std::map& index_to_value, int& len) {
-    result[0] = HLL_DATA_SPRASE;
-    len = sizeof(HllSetResolver::SetTypeValueType) + sizeof(HllSetResolver::SparseLengthValueType);
-    char* write_value_pos = result + len;
-    for (std::map::const_iterator iter = index_to_value.begin();
-         iter != index_to_value.end(); iter++) {
-        write_value_pos[0] = (char)(iter->first & 0xff);
-        write_value_pos[1] = (char)(iter->first >> 8 & 0xff);
-        write_value_pos[2] = iter->second;
-        write_value_pos += 3;
-    }
-    int registers_count = index_to_value.size();
-    len += registers_count * (sizeof(HllSetResolver::SparseIndexType) + sizeof(HllSetResolver::SparseValueType));
-    *(int*)(result + 1) = registers_count;
-}
-
-void HllSetHelper::set_expliclit(char* result, const std::set& hash_value_set, int& len) {
-    result[0] = HLL_DATA_EXPLICIT;
-    result[1] = (HllSetResolver::ExpliclitLengthValueType)hash_value_set.size();
-    len = sizeof(HllSetResolver::SetTypeValueType) + sizeof(HllSetResolver::ExpliclitLengthValueType);
-    char* writePosition = result + len;
-    for (std::set::const_iterator iter = hash_value_set.begin();
-         iter != hash_value_set.end(); iter++) {
-        uint64_t hash_value = *iter;
-        *(uint64_t*)writePosition = hash_value;
-        writePosition += 8;
-    }
-    len += sizeof(uint64_t) * hash_value_set.size();
-}
-
-void HllSetHelper::set_full(char* result, const char* registers, const int registers_len, int& len) {
-    result[0] = HLL_DATA_FULL;
-    memcpy(result + 1, registers, registers_len);
-    len = registers_len + sizeof(HllSetResolver::SetTypeValueType);
-}
-
-void HllSetHelper::set_full(char* result, 
-                            const std::map& index_to_value, 
-                            const int registers_len, int& len) {
-    result[0] = HLL_DATA_FULL;
-    for (std::map::const_iterator iter = index_to_value.begin();
-         iter != index_to_value.end(); iter++) {
-        result[1 + iter->first] = iter->second;
-    }
-    len = registers_len + sizeof(HllSetResolver::SetTypeValueType);
-}
-
-void HllSetHelper::set_max_register(char* registers, int registers_len, 
-                                    const std::set& hash_set) {
-    for (std::set::const_iterator iter = hash_set.begin();
-         iter != hash_set.end(); iter++) { 
-        uint64_t hash_value = *iter;
-        int idx = hash_value % registers_len;
-        uint8_t first_one_bit = __builtin_ctzl(hash_value >> HLL_COLUMN_PRECISION) + 1;     
-        registers[idx] = std::max((uint8_t)registers[idx], first_one_bit);
-    }    
-}
-
-template 
-void FieldHllUnionAggreator::finalize_one_merge(T t) {
-    char* buf = (char*)t;
-    if (!_has_value) {
-        return;
-    }
-    std::map index_to_value;
-    if (_hash64_set.size() > HLL_EXPLICLIT_INT64_NUM 
-              || _has_sparse_or_full) {
-        HllSetHelper::set_max_register(_registers, HLL_REGISTERS_COUNT, _hash64_set);
-        for (int i = 0; i < HLL_REGISTERS_COUNT; i++) {
-            if (_registers[i] != 0) { 
-                index_to_value[i] = _registers[i];
-            }    
-        } 
-    }    
-    int length_value_type_len = sizeof(VarCharField::LengthValueType);
-    int sparse_set_len = index_to_value.size() *
-                         (sizeof(HllSetResolver::SparseIndexType) 
-                         + sizeof(HllSetResolver::SparseValueType))
-                         + sizeof(HllSetResolver::SparseLengthValueType); 
-    int result_len = 0;
-
-    if (sparse_set_len >= HLL_COLUMN_DEFAULT_LEN) {
-        // full set
-        HllSetHelper::set_full(buf + length_value_type_len, _registers, 
-                               HLL_REGISTERS_COUNT, result_len);
-    } else if (index_to_value.size() > 0) {
-        // sparse set
-        HllSetHelper::set_sparse(buf + length_value_type_len, 
-                                 index_to_value, result_len); 
-    } else if (_hash64_set.size() > 0) {
-        // expliclit set
-        HllSetHelper::set_expliclit(buf + length_value_type_len,
-                                    _hash64_set, result_len);
-    } 
- 
-    VarCharField::LengthValueType* length_value = (VarCharField::LengthValueType*)buf;
-    *length_value = result_len & 0xffff;
-
-    reset();
-}
 }  // namespace palo
diff --git a/be/src/olap/field.h b/be/src/olap/field.h
index 72ccf09175..bf0efdcfee 100644
--- a/be/src/olap/field.h
+++ b/be/src/olap/field.h
@@ -16,358 +16,21 @@
 #ifndef BDG_PALO_BE_SRC_OLAP_FIELD_H
 #define BDG_PALO_BE_SRC_OLAP_FIELD_H
 
-#include 
-#include 
-
-#include 
-#include 
 #include 
 
-#include "gen_cpp/AgentService_types.h"
+#include "olap/aggregate_func.h"
+#include "olap/field_info.h"
 #include "olap/olap_common.h"
 #include "olap/olap_define.h"
+#include "olap/string_slice.h"
+#include "olap/types.h"
 #include "olap/utils.h"
-#include "util/mem_util.hpp"
-#include "olap/column_file/bloom_filter.hpp"
-#include "common/config.h"
-#include "runtime/string_value.h"
 #include "runtime/mem_pool.h"
+#include "util/hash_util.hpp"
+#include "util/mem_util.hpp"
 
 namespace palo {
 
-const static int HLL_COLUMN_PRECISION = 14;
-const static int HLL_EXPLICLIT_INT64_NUM = 160;
-const static int HLL_REGISTERS_COUNT = 16384;
-// regisers (2^14) + 1 (type)
-const static int HLL_COLUMN_DEFAULT_LEN = 16385;
-
-class AggregateFunctions;
- 
-// 定义uint24_t类型和需è¦çš„一些方法
-struct uint24_t {
-public:
-    uint24_t() {
-        memset(data, 0, sizeof(data));
-    }
-
-    uint24_t& operator+=(const uint24_t& value) {
-        *this = static_cast(*this) + static_cast(value);
-        return *this;
-    }
-
-    operator int() const {
-        int value = static_cast(data[0]);
-        value += (static_cast(static_cast(data[1]))) << 8;
-        value += (static_cast(static_cast(data[2]))) << 16;
-        return value;
-    }
-
-    uint24_t& operator=(const int& value) {
-        data[0] = static_cast(value);
-        data[1] = static_cast(value >> 8);
-        data[2] = static_cast(value >> 16);
-        return *this;
-    }
-
-    uint24_t& operator=(const int64_t& value) {
-        data[0] = static_cast(value);
-        data[1] = static_cast(value >> 8);
-        data[2] = static_cast(value >> 16);
-        return *this;
-    }
-
-private:
-    uint8_t data[3];
-} __attribute__((packed));
-
-struct decimal12_t {
-    decimal12_t() : integer(0), fraction(0) {}
-    decimal12_t(int64_t int_part, int32_t frac_part) {
-        integer = int_part;
-        fraction = frac_part;
-    }
-
-    decimal12_t& operator+=(const decimal12_t& value) {
-        fraction += value.fraction;
-        integer += value.integer;
-
-        if (fraction >= FRAC_RATIO) {
-            integer += 1;
-            fraction -= FRAC_RATIO;
-        } else if (fraction <= -FRAC_RATIO) {
-            integer -= 1;
-            fraction += FRAC_RATIO;
-        }
-
-        if (fraction * integer < 0) {
-            bool sign = integer < 0;
-            integer += (sign ? 1 : -1);
-            fraction += (sign ? -FRAC_RATIO : FRAC_RATIO);
-        }
-
-        //OLAP_LOG_WARNING("agg: int=%ld, frac=%d", integer, fraction);
-        //_set_flag();
-        return *this;
-    }
-
-    // call field::copy
-    decimal12_t& operator=(const decimal12_t& value) {
-        integer = value.integer;
-        fraction = value.fraction;
-        return *this;
-    }
-
-    bool operator>(const decimal12_t& value) const {
-        return cmp(value) > 0;
-    }
-
-    bool operator>=(const decimal12_t& value) const {
-        return cmp(value) >= 0;
-    }
-
-    bool operator==(const decimal12_t& value) const {
-        return cmp(value) == 0;
-    }
-
-    int32_t cmp(const decimal12_t& other) const {
-        if (integer > other.integer) {
-            return 1;
-        } else if (integer == other.integer) {
-            if (fraction > other.fraction) {
-                return 1;
-            } else if (fraction == other.fraction) {
-                return 0;
-            }
-        }
-
-        return -1;
-    }
-
-    std::string to_string() {
-        char buf[128] = {'\0'};
-
-        if (integer < 0 || fraction < 0) {
-            snprintf(buf, sizeof(buf), "-%lu.%09u",
-                     std::abs(integer), std::abs(fraction));
-        } else {
-            snprintf(buf, sizeof(buf), "%lu.%09u",
-                     std::abs(integer), std::abs(fraction));
-        }
-
-        return std::string(buf);
-    }
-
-    std::string to_buf() {
-        char buf[128] = {'\0'};
-
-        if (integer < 0 || fraction < 0) {
-            snprintf(buf, sizeof(buf), "-%lu.%09u",
-                     std::abs(integer), std::abs(fraction));
-        } else {
-            snprintf(buf, sizeof(buf), "%lu.%09u",
-                     std::abs(integer), std::abs(fraction));
-        }
-
-        return std::string(buf);
-    }
-
-    OLAPStatus from_string(const std::string& str) {
-        integer = 0;
-        fraction = 0;
-        const char* value_string = str.c_str();
-        const char* sign = strchr(value_string, '-');
-
-        if (sign != NULL) {
-            if (sign != value_string) {
-                return OLAP_ERR_INPUT_PARAMETER_ERROR;
-            } else {
-                ++value_string;
-            }
-        }
-
-        const char* sepr = strchr(value_string, '.');
-        if ((sepr != NULL && sepr - value_string > MAX_INT_DIGITS_NUM)
-                || (sepr == NULL && strlen(value_string) > MAX_INT_DIGITS_NUM)) {
-            integer = 999999999999999999;
-            fraction = 999999999;
-        } else {
-            if (sepr == value_string) {
-                sscanf(value_string, ".%9d", &fraction);
-                integer = 0;
-            } else {
-                sscanf(value_string, "%18ld.%9d", &integer, &fraction);
-            }
-
-            int32_t frac_len = (NULL != sepr) ?
-                               MAX_FRAC_DIGITS_NUM - strlen(sepr + 1) : MAX_FRAC_DIGITS_NUM;
-            frac_len = frac_len > 0 ? frac_len : 0;
-            fraction *= g_power_table[frac_len];
-        }
-
-        if (sign != NULL) {
-            fraction = -fraction;
-            integer = -integer;
-        }
-
-        return OLAP_SUCCESS;
-    }
-
-    static const int32_t FRAC_RATIO = 1000000000;
-    static const int32_t MAX_INT_DIGITS_NUM = 18;
-    static const int32_t MAX_FRAC_DIGITS_NUM = 9;
-
-    int64_t integer;
-    int32_t fraction;
-} __attribute__((packed));
-
-// èšé›†æ–¹æ³•的纯虚模æ¿åŸºç±»
-template 
-class FieldAggregator {
-public:
-    // èšé›†ç»“果存储在left上
-    virtual void operator()(T left, const T right, uint32_t length) = 0;
-
-    virtual ~FieldAggregator() {}
-
-    virtual void finalize_one_merge(T t) {}
-};
-
-// 什么都ä¸åš
-template 
-class FieldNoneAggregator : public FieldAggregator {
-public:
-    void operator()(T left, const T right, uint32_t length) {}
-};
-
-// MINèšé›†ï¼Œä¸¤ä¸ªå€¼ä¸­å–最å°çš„一个
-template 
-class FieldMinAggregator : public FieldAggregator {
-public:
-    void operator()(T left, const T right, uint32_t length) {
-        *left > *right ? *left = *right : *left;
-    }
-};
-
-// MAXèšé›†ï¼Œä¸¤ä¸ªå€¼ä¸­å–最大的一个
-template 
-class FieldMaxAggregator : public FieldAggregator {
-public:
-    void operator()(T left, const T right, uint32_t length) {
-        *right > *left ? *left = *right : *left;
-    }
-};
-
-// SUMèšé›†ï¼Œå°†å³å€¼ç´¯åŠ åˆ°å·¦å€¼ä¸­
-template 
-class FieldAddAggregator : public FieldAggregator {
-public:
-    void operator()(T left, const T right, uint32_t length) {
-        *left += *right;
-    }
-};
-
-// REPLACEèšé›†ï¼Œä½¿ç”¨å³å€¼æ›¿æ¢å·¦å€¼
-template 
-class FieldReplaceAggregator : public FieldAggregator {
-public:
-    void operator()(T left, const T right, uint32_t length) {
-        memcpy(left, right, length);
-    }
-};
-
-
-// ä¿å­˜Field元信æ¯çš„结构体
-struct FieldInfo {
-public:
-    // åç§°
-    std::string name;
-    // æ•°æ®ç±»åž‹
-    FieldType type;
-    // èšé›†æ–¹å¼
-    FieldAggregationMethod aggregation;
-    // 长度,å•ä½ä¸ºå­—节
-    // 除字符串外,其它类型都是确定的
-    uint32_t length;
-    // å‰ç¼€ç´¢å¼•长度,如果为0,表示ä¸ä½¿ç”¨å‰ç¼€ç´¢å¼•,
-    // å¦åˆ™åˆ™æŒ‰ç…§éœ€è¦è®¾ç½®å‰ç¼€ç´¢å¼•,ç›®å‰åªå¯¹å­—符串起作用。
-    uint32_t index_length;
-    // æ˜¯å¦æ˜¯Primary Key
-    bool is_key;
-
-    bool has_default_value;
-    std::string default_value;
-
-    bool has_referenced_column;
-    std::string referenced_column;
-
-    // used to creating decimal data type
-    uint32_t precision;
-    uint32_t frac;
-
-    bool is_allow_null;
-    // 全局唯一id
-    uint32_t unique_id;
-    // å­åˆ—çš„index
-    std::vector sub_columns;
-    // æ˜¯å¦æ˜¯å…¶ä»–列的å­åˆ—
-    bool is_root_column;
-
-    // is bloom filter column
-    bool is_bf_column;
-public:
-    static std::string get_string_by_field_type(FieldType type);
-    static std::string get_string_by_aggregation_type(FieldAggregationMethod aggregation_type);
-    static FieldType get_field_type_by_string(const std::string& str);
-    static FieldAggregationMethod get_aggregation_type_by_string(const std::string& str);
-    static uint32_t get_field_length_by_type(TPrimitiveType::type type, uint32_t string_length);
-
-    OLAPStatus set_default_value(const char* str) {
-        default_value = str;
-
-        return OLAP_SUCCESS;
-    }
-
-    std::string to_string() const {
-        char buf[1024] = {'\0'};
-        snprintf(buf, sizeof(buf), "{name='%s' type='%s' aggregation='%s' length=%u is_key=%d "
-                 "is_allow_null=%d}",
-                 name.c_str(),
-                 get_string_by_field_type(type).c_str(),
-                 get_string_by_aggregation_type(aggregation).c_str(),
-                 length,
-                 is_key,
-                 is_allow_null);
-        return std::string(buf);
-    }
-
-    std::string to_buf() const {
-        char buf[1024] = {'\0'};
-        snprintf(buf, sizeof(buf), "{name='%s' type='%s' aggregation='%s' length=%u is_key=%d "
-                 "is_allow_null=%d}",
-                 name.c_str(),
-                 get_string_by_field_type(type).c_str(),
-                 get_string_by_aggregation_type(aggregation).c_str(),
-                 length,
-                 is_key,
-                 is_allow_null);
-        return std::string(buf);
-    }
-
-    std::string to_json() const {
-        char buf[1024] = {'\0'};
-        snprintf(buf, sizeof(buf), "{\"name\":\"%s\",\"type\":\"%s\",\"aggregation\":\"%s\","
-                 "\"length\":%u,\"is_key\":%d,\"is_allow_null\":%d}",
-                 name.c_str(),
-                 get_string_by_field_type(type).c_str(),
-                 get_string_by_aggregation_type(aggregation).c_str(),
-                 length,
-                 is_key,
-                 is_allow_null);
-        return std::string(buf);
-    }
-};
-
-// Field的基类,定义接å£ï¼ŒåŒæ—¶ä¹Ÿæä¾›åˆ›å»ºç‰¹å®šç±»åž‹çš„Fieldå®žä¾‹çš„é™æ€æ–¹æ³•
 // Fieldå†…éƒ¨å‚æ•°ä¸ºField*çš„æ–¹æ³•éƒ½è¦æ±‚实例类型和当å‰ç±»åž‹ä¸€è‡´ï¼Œå¦åˆ™ä¼šäº§ç”Ÿæ— æ³•预知的错误
 // å‡ºäºŽæ•ˆçŽ‡çš„è€ƒè™‘ï¼Œå¤§éƒ¨åˆ†å‡½æ•°å®žçŽ°å‡æ²¡æœ‰å¯¹å‚数进行检查
 class Field {
@@ -376,1188 +39,211 @@ public:
     // æ ¹æ®ç±»åž‹çš„ä¸åŒï¼Œä½¿ç”¨ä¸åŒçš„类模æ¿å‚数或者å­ç±»
     // 对于没有预料到的类型,会返回NULL
     static Field* create(const FieldInfo& field_info);
-    static Field* create_by_type(const FieldType& field_type);
+    static Field* create_by_type(const FieldType& type);
 
-    Field() {
-        _buf = NULL;
-        _is_allocated = false;
-        _overhead = 0;
-        _length_ptr = &_length;
-    }
-    virtual ~Field() {
-        if (_is_allocated) {
-            SAFE_DELETE_ARRAY(_field_buf);
-        }
+    Field(const FieldInfo& field_info);
+
+    inline void set_offset(size_t offset) { _offset = offset; }
+    inline size_t get_offset() const { return _offset; }
+
+    //get ptr without NULL byte
+    inline char* get_ptr(char* buf) const { return buf + _offset + 1; }
+
+    //get ptr with NULL byte
+    inline char* get_field_ptr(char* buf) const { return buf + _offset; }
+
+    inline size_t size() const { return _size; }
+    inline size_t field_size() const { return _size + 1; }
+    inline size_t index_size() const { return _index_size; }
+
+    inline void set_to_max(char* buf) { return _type_info->set_to_max(buf); }
+    inline void set_to_min(char* buf) { return _type_info->set_to_min(buf); }
+    inline bool is_min(char* buf) { return _type_info->is_min(buf); }
+
+    inline bool is_null(char* buf) const {
+        return *reinterpret_cast(buf + _offset);
     }
 
-    // åˆå§‹åŒ–Field
-    virtual OLAPStatus init() = 0;
-
-    virtual bool allocate() {
-        _is_allocated = true;
-
-        if (NULL == (_field_buf = new(std::nothrow) char[_buf_size + sizeof(char)])) {
-            OLAP_LOG_FATAL("failed to malloc _field_buf. [size=%ld]", _buf_size + sizeof(char));
-            return false;
-        }
-        memset(_field_buf, 0, _buf_size + sizeof(char));
-        _is_null = _field_buf;
-        _buf = _field_buf + 1;
-
-        return true;
+    inline void set_null(char* buf) const {
+        *reinterpret_cast(buf + _offset) = true;
     }
 
-    inline void set_buf_size(size_t size) {
-        _buf_size = size;
+    inline void set_not_null(char* buf) const {
+        *reinterpret_cast(buf + _offset) = false;
     }
 
-    inline void set_string_length(uint16_t len) {
-       _length = len; 
-    } 
-
-    inline uint16_t get_string_length() const{
-        return _length;  
-    }
-
-    inline size_t get_buf_size() const{
-        return _buf_size;
-    }
-
-    // 判断两个Field是å¦ç›¸ç­‰
-    inline bool equal(const Field* field);
-
     // 返回-1,0,1,分别代表当å‰fieldå°äºŽï¼Œç­‰äºŽï¼Œå¤§äºŽä¼ å…¥å‚数中的field
-    inline int cmp(const Field* field) const;
+    inline int cmp(char* left, char* right) const;
+    inline int index_cmp(char* left, char* right) const;
+    inline bool equal(char* left, char* right);
 
-    inline int index_cmp(const Field* field) const;
+    inline void aggregate(char* dest, char* src);
+    inline void finalize(char* data);
 
-    // 实际的比较方法的实现, 直接调用会有虚函数开销
-    virtual int real_cmp(const Field* field) const = 0;
+    inline void copy_with_pool(char* dest, const char* src, MemPool* mem_pool);
+    inline void copy_without_pool(char* dest, const char* src);
+    inline void agg_init(char* dest, const char* src);
 
-    // 调用对应的èšé›†æ–¹æ³•
-    // 具体的行为å–决于使用上é¢å®šä¹‰çš„哪一ç§èšé›†æ–¹æ³•
-    // 调用方为左值,传入的为å³å€¼
-    virtual void aggregate(const Field* field) = 0;
-    
-    virtual void finalize_one_merge() = 0;
-
-    virtual FieldAggregationMethod get_aggregation_method() = 0;
-
-    // 返回当å‰å­—段的长度, å•使˜¯å­—节
-    inline size_t size() const {
-        return *_length_ptr + _overhead;
+    // copy filed content from src to dest without nullbyte
+    inline void copy_content(char* dest, const char* src, MemPool* mem_pool) {
+        _type_info->copy_with_pool(dest, src, mem_pool);
     }
+    inline void to_index(char* dest, const char* src);
 
-    inline size_t field_size() const {
-        //add the byte occupied by _is_null
-        return size() + sizeof(char);
+    // used by init scan key stored in string format
+    // value_string should end with '\0'
+    inline OLAPStatus from_string(char* buf, const std::string& value_string) {
+        return _type_info->from_string(buf, value_string);
     }
 
-    // 返回作为index的长度,由于有å‰ç¼€ç´¢å¼•的存在, 所以å¯èƒ½å’Œå®žé™…字段的长度ä¸åŒ
-    virtual size_t index_size() const {
-        return _index_length;
-    }
-
-    // 返回当å‰å­—段在MySQL中的长度, å•使˜¯å­—节
-    virtual size_t mysql_size() const {
-        return size();
-    }
-
-    // 从å¦ä¸€ä¸ªfield对象中拷è´å€¼åˆ°å†…部指å‘çš„buffer中
-    inline void copy(const Field* field);
-
     // 将内部的value转æˆstring输出
     // 没有考虑实现的性能,仅供DEBUG使用
-    virtual std::string to_string() const = 0;
-    virtual std::string to_buf() const = 0;
-
-    // æŠŠå½“å‰æŒ‡å‘çš„buf中的数æ®è½¬ä¸ºMySQL的格å¼
-    // ç”±äºŽç›®å‰æ‰€æœ‰Field类型的长度都和MySQL一致, 因此实现原地转æ¢ä»¥é¿å…多数情况下的memcpy
-    inline void to_mysql();
-
-    // 转为内部存储格å¼å¹¶å†™å…¥buf
-    virtual void to_storage(char* buf) const = 0;
-    // 转为索引格å¼å¹¶å†™å…¥buf,é™å®šè¾“出长度
-    virtual void to_index(char* buf) = 0;
-
-    // 转为MySQLæ ¼å¼å¹¶å†™å…¥buf
-    virtual void to_mysql(char* buf) const = 0;
-
-    // 从传入的字符串ååºåˆ—化field的值
-    // 傿•°å¿…须是一个\0结尾的字符串
-    virtual OLAPStatus from_string(const std::string& value_string) = 0;
-
-    // ä»Žæ•°æ®æ–‡ä»¶ä¸­è¯»å–一个值,并写入内部buffer
-    virtual void from_storage(const char* buf) = 0;
-
-    // attach到一段buf
-    virtual void attach_buf(char* buf) {
-        _buf = buf;
+    inline std::string to_string(char* src) const {
+        return _type_info->to_string(src);
     }
 
-    virtual void attach_field(char* field) {
-        _is_null = field;
-        _buf = field + sizeof(char);
-    }
-
-    // èŽ·å–æŒ‡å‘内部buf的指针
-    char* buf() const {
-        return _buf;
-    }
-
-    char* get_null() const {
-        return _is_null;
-    }
-
-    bool is_null() const {
-        return *reinterpret_cast(_is_null);
-    }
-
-    void set_null() {
-        _is_null[0] |= 1;
-    }
-
-    void set_not_null() {
-        _is_null[0] &= ~1;
-    }
-
-    FieldType type() const {
-        return _field_type;
-    }
-
-    virtual void set_to_max() {}
-    virtual void set_to_min() {}
-    virtual bool is_min() {
-        return false;
-    }
-
-protected:
-    char* _is_null;
-    // 存储数æ®çš„Buffer
-    char* _buf;
-    char* _field_buf;//store _buf and _is_null
-    // buf是å¦ç”±å†…部分é…管ç†
-    bool _is_allocated;
+    inline uint32_t hash_code(char* data, uint32_t seed) const;
+private:
+    FieldType _type;
     // Field的长度,å•ä½ä¸ºå­—节
-    uint16_t _length;
-    uint16_t* _length_ptr;
-    uint16_t _overhead;
+    uint16_t _size;
     // Field的最大长度,å•ä½ä¸ºå­—节,通常等于length, å˜é•¿å­—符串ä¸åŒ
-    size_t _index_length;
-    //
-    size_t _buf_size;
-    // Field类型
-    FieldType _field_type;
+    uint16_t _index_size;
+    size_t _offset; //offset in row buf
+    TypeInfo* _type_info;
+
+    AggregateFunc _aggregate_func;
+    FinalizeFunc _finalize_func;
 };
 
-// 继承自Field的基础模æ¿ç±»
-template 
-class BaseField : public Field {
-public:
-    BaseField(const FieldInfo& field_info) {
-        _length = sizeof(T);
-        _buf_size = _length;
-        _index_length = field_info.index_length;
-        _aggregation = field_info.aggregation;
-        _aggregator = NULL;
-        _buf = NULL;
-    }
-
-    virtual ~BaseField() {
-        SAFE_DELETE(_aggregator);
-    }
-
-    virtual OLAPStatus init();
-
-    virtual int real_cmp(const Field* field) const {
-        T left = *_value();
-        T right = *_value(field->buf());
-
-        if (left > right) {
-            return 1;
-        } else if (right > left) {
-            return -1;
-        } else {
-            return 0;
-        }
-    }
-
-    virtual FieldAggregationMethod get_aggregation_method() {
-        return _aggregation;
-    }
-
-    virtual void aggregate(const Field* field) {
-        (*_aggregator)(_value(), _value(field->buf()), field->size());
-    }
-
-    virtual void finalize_one_merge() {
-        _aggregator->finalize_one_merge(_value());
-    } 
-
-    virtual std::string to_string() const;
-    virtual std::string to_buf() const;
-    virtual void to_storage(char* buf) const {
-        memcpy(buf, _buf, sizeof(T));
-    }
-    virtual void to_index(char* index_buf) {
-        memcpy(index_buf, _is_null, sizeof(char));
-        to_storage(index_buf + sizeof(char));
-    }
-    virtual void to_mysql(char* buf) const {
-        memcpy(buf, _buf, sizeof(T));
-    }
-
-    virtual void from_storage(const char* buf) {
-        memcpy(_buf, buf, sizeof(T));
-    }
-    virtual OLAPStatus from_string(const std::string& value_string) {return OLAP_SUCCESS;}
-
-protected:
-    T* _value() const {
-        return reinterpret_cast(_buf);
-    }
-    static T* _value(char* buf) {
-        return reinterpret_cast(buf);
-    }
-
-protected:
-    // èšé›†æ–¹æ³•
-    FieldAggregationMethod _aggregation;
-    // èšé›†æ–¹æ³•对应的实现类
-    FieldAggregator* _aggregator;
-};
-
-// 实现对所有数值类Field类型的处ç†
-// 继承自BaseField
-template 
-class NumericField : public BaseField {
-public:
-    NumericField(const FieldInfo& field_info) : BaseField(field_info) {};
-    virtual void set_to_max() {
-        T max_value = std::numeric_limits::max();
-        memcpy(this->_buf, reinterpret_cast(&max_value), sizeof(T));
-    };
-    virtual void set_to_min() {
-        T min_value = std::numeric_limits::min();
-        memcpy(this->_buf, reinterpret_cast(&min_value), sizeof(T));
-    };
-    virtual bool is_min() {
-        T min_value = std::numeric_limits::min();
-        if (*(this->_value()) == min_value) {
-            return true;
-        } else {
-            return false;
-        }
-    }
-
-};
-
-// 实现对CHAR类型的处ç†
-// 继承自BaseField,å¯ä»¥å¤ç”¨å¤§éƒ¨åˆ†æ–¹æ³•
-class CharField : public BaseField {
-public:
-    CharField(const FieldInfo& field_info)
-        : BaseField(field_info) {
-        _length = field_info.length;
-        _buf_size = _length;
-    }
-
-    virtual ~CharField() {}
-
-    virtual OLAPStatus init();
-
-    virtual std::string to_string() const;
-    virtual std::string to_buf() const;
-    virtual OLAPStatus from_string(const std::string& value_string);
-    virtual void from_storage(const char* buf) {
-        memcpy(_buf, buf, _length);
-    }
-    virtual void to_storage(char* buf) const {
-        memcpy(buf, _buf, _length);
-    }
-    virtual void to_mysql(char* buf) const {
-        memcpy(buf, _buf, _length);
-    }
-
-    virtual void set_to_max() {
-        if (NULL != _buf) {
-            memset(_buf, 0xff, _length);
-        }
-    }
-    virtual void set_to_min() {
-        if (NULL != _buf) {
-            memset(_buf, 0, _length);
-        }
-    }
-
-    virtual bool is_min() {
-        if (_buf == NULL || strlen(_buf) == 0) {
-            return true;
-        } else {
-            return false;
-        }
-    }
-
-    int real_cmp(const Field* field) const {
-        uint16_t field_len = field->get_string_length();
-        uint16_t cmp_len = std::min(_length, field_len);
-        int res = strncmp(_buf, field->buf(), cmp_len);
-
-        if (res > 0) {
-            return 1;
-        } else if (0 == res) {
-            if (_length > field_len) {
-                return 1;
-            } else if (_length < field_len) {
-                return -1;
-            }
-            return 0;
-        } else {
-            return -1;
-        }
-    }
-};
-
-class VarCharField : public CharField {
-public:
-    VarCharField(const FieldInfo& field_info)
-        : CharField(field_info) {
-        _buf_size = field_info.length;
-        _overhead = sizeof(uint16_t);
-    }
-
-    virtual void attach_field(char* buf) {
-        _is_null = buf;
-        _buf = buf + sizeof(char);
-        _length_ptr = reinterpret_cast(_buf);
-    }
-
-    virtual void attach_buf(char* buf) {
-        _buf = buf;
-        _length_ptr = reinterpret_cast(_buf);
-    }
-
-    virtual bool allocate() {
-        if (Field::allocate()) {
-            attach_buf(_buf);
-            *_length_ptr = 0;
-            return true;
-        }
-
-        return false;
-    }
-
-    // é‡å†™size,字符串的长度å‡ä½¿ç”¨åЍæ€èŽ·å–
-
-    // å‡éœ€è¦å¦å¤–实现
-    virtual std::string to_string() const;
-    virtual std::string to_buf() const;
-    virtual OLAPStatus from_string(const std::string& value_string);
-
-    // é‡ç‚¹æ˜¯ï¼Œå¦‚何知é“该拷è´å¤šé•¿ï¼Œéœ€è¦è§£æžå¤´
-    virtual void from_storage(const char* buf) {
-        size_t copy_length = _get_length_from_buf(buf) + sizeof(LengthValueType);
-        copy_length = _buf_size < copy_length ? _buf_size : copy_length;
-        memcpy(_buf, buf, copy_length);
-        _set_length_to_buf(_buf, copy_length - sizeof(LengthValueType));
-    }
-
-    void from_storage_length(const char* buf, int length) {
-        size_t copy_length = length + sizeof(LengthValueType);
-        copy_length = _buf_size < copy_length ? _buf_size : copy_length;
-        memcpy(_buf + sizeof(LengthValueType), buf, copy_length - sizeof(LengthValueType));
-        _set_length_to_buf(_buf, length);
-    }
-
-
-    virtual void to_storage(char* buf) const {
-        memcpy(buf, _buf, size());
-    }
-
-    virtual void to_index(char* index_buf) {
-        
-        size_t copy_size = size() < _index_length?
-                           size() : _index_length;
-        // å…ˆæ¸…é›¶ï¼Œå†æ‹·è´
-        memset(index_buf, 0, _index_length + sizeof(char));
-        memcpy(index_buf, _is_null, sizeof(char));
-        memcpy(index_buf + sizeof(char), _buf, copy_size);
-        _set_length_to_buf(index_buf + sizeof(char), copy_size - sizeof(LengthValueType));
-    }
-
-    virtual void to_mysql(char* buf) const {
-        memcpy(buf, _buf, size());
-    }
-
-    virtual void set_to_max() {
-        if (NULL != _buf) {
-            _set_length_to_buf(_buf, 1);
-            memset(_buf + sizeof(LengthValueType), 0xFF, 1);
-        }
-    }
-
-    virtual void set_to_min() {
-        if (NULL != _buf) {
-            _set_length_to_buf(_buf, 0);
-        }
-    }
-
-    virtual bool is_min() {
-        if (NULL == _buf || 0 == _get_length_from_buf(_buf)) {
-            return true;
-        } else {
-            return false;
-        }
-    }
-
-    int real_cmp(const Field* field) const {
-        return varchar_cmp(_buf + sizeof(LengthValueType),
-                           size() - sizeof(LengthValueType),
-                           field->buf() + sizeof(LengthValueType),
-                           field->size() - sizeof(LengthValueType));
-    }
-
-    static int varchar_cmp(const char* buf1, size_t length1, const char* buf2, size_t length2) {
-        size_t compare_size = std::min(length1, length2);
-        int res = strncmp(buf1, buf2, compare_size);
-
-        if (res > 0) {
-            return 1;
-        } else if (0 == res) {
-            if (length1 > length2) {
-                return 1;
-            } else if (length1 == length2) {
-                return 0;
-            }
-        }
-
-        return -1;
-    }
-
-    int short_key_cmp(const Field* field) const {
-        // 如果field的实际长度比short key长,则仅比较å‰ç¼€ï¼Œç¡®ä¿ç›¸åŒshort key的所有block都被扫æï¼Œ
-        // å¦åˆ™ï¼Œå¯ä»¥ç›´æŽ¥æ¯”较short keyå’Œfield
-        int res = 0;
-
-        if (field->size() > _index_length) {
-            int compare_size = _index_length - sizeof(LengthValueType);
-            res = strncmp(_buf + sizeof(LengthValueType),
-                          field->buf() + sizeof(LengthValueType), compare_size);
-        } else {
-            res = real_cmp(field);
-        }
-
-        return res;
-    }
-
-    typedef uint32_t OffsetValueType;
-    typedef uint16_t LengthValueType;
-protected:
-    inline LengthValueType _get_length_from_buf(const char* buf) {
-        return *reinterpret_cast(buf);
-    }
-
-    inline void _set_length_to_buf(char* buf, LengthValueType length) {
-        *reinterpret_cast(buf) = length;
-    }
-};
-
-// help parse hll set
-class HllSetResolver {
-        
-public:
-    HllSetResolver() : _buf_ref(nullptr),
-                       _buf_len(0),
-                       _set_type(HLL_DATA_EMPTY),
-                       _full_value_position(nullptr),
-                       _expliclit_value(nullptr),
-                       _expliclit_num(0) {
-    }
-
-    typedef uint8_t SetTypeValueType;
-    typedef uint8_t ExpliclitLengthValueType;
-    typedef int32_t SparseLengthValueType;
-    typedef uint16_t SparseIndexType;
-    typedef uint8_t SparseValueType;
-
-    // only save pointer 
-    void init(char* buf, int len){
-        this->_buf_ref = buf;
-        this->_buf_len = len;
-    }
-
-    // hll set type
-    HllDataType get_hll_data_type() { 
-        return _set_type;
-    };
-    
-    // expliclit value num
-    int get_expliclit_count() { 
-        return (int)_expliclit_num; 
-    };
-    
-    // get expliclit index value 64bit
-    uint64_t get_expliclit_value(int index) {
-        if (index >= _expliclit_num) {
-            return -1;
-        }
-        return _expliclit_value[index];
-    };
-
-    // get expliclit index value 64bit
-    char* get_expliclit_value() { 
-        return (char*)_expliclit_value; 
-    };
-
-    // get full register value
-    char* get_full_value() { 
-        return _full_value_position; 
-    };
-
-    // get sparse (index, value) count
-    int get_sparse_count() { 
-        return (int)*_sparse_count; 
-    };
-
-    // get (index, value) map
-    std::map& get_sparse_map() { 
-        return _sparse_map; 
-    };
-    
-    // parse set , call after copy() or init()
-    void parse();
-   
-    // fill registers with set
-    void fill_registers(char* registers, int len);
-
-    // fill map with set
-    void fill_index_to_value_map(std::map* index_to_value, int len);
-    
-    // fill hash map
-    void fill_hash64_set(std::set* hash_set);
-    
-private :
-    
-    char* _buf_ref;    // set
-    int _buf_len;      // set len
-    HllDataType _set_type;        //set type
-    char* _full_value_position;
-    uint64_t* _expliclit_value;
-    ExpliclitLengthValueType _expliclit_num;
-    std::map _sparse_map;
-    SparseLengthValueType* _sparse_count;
-};
-
-class HllSetHelper {
-
-public:
-
-    static void set_sparse(char *result,const std::map& index_to_value, int& len);
-
-    static void set_expliclit(char* result, const std::set& hash_value_set, int& len);
-
-    static void set_full(char* result, const char* registers, const int set_len, int& len);
-
-    static void set_full(char* result, const std::map& index_to_value, 
-                         const int set_len, int& len);
-
-    static void set_max_register(char *registers,
-                                 int registers_len, 
-                                 const std::set& hash_set);
-};
-
-// 通过varcharçš„å˜é•¿ç¼–ç æ–¹å¼å®žçްhll集åˆ
-// 实现hll列中间计算结果的处ç†
-// empty 空集åˆ
-// expliclit 存储64ä½hash值的集åˆ
-// sparse 存储hlléž0çš„register
-// full  存储全部的hll register
-// empty -> expliclit -> sparse -> full å››ç§ç±»åž‹çš„è½¬æ¢æ–¹å‘ä¸å¯é€†
-// 第一个字节存放hll集åˆçš„类型 0:empty 1:expliclit 2:sparse 3:full
-// 已决定åŽé¢çš„æ•°æ®æ€Žä¹ˆè§£æž
-class HllField : public VarCharField {
-  
-public:
-    
-    HllField(const FieldInfo& field_info)
-    : VarCharField(field_info) {
-    }
-    
-    void parse() {
-        resolver.init(_buf + sizeof(VarCharField::LengthValueType),*reinterpret_cast(_buf));
-        resolver.parse();
-    }
-    // hll set type
-    HllDataType get_hll_data_type() { return resolver.get_hll_data_type();};
-    
-    // expliclit value num
-    int get_expliclit_count() { return resolver.get_expliclit_count(); };
-    
-    // get expliclit index value 64bit
-    uint64_t get_expliclit_value(int index) {
-        return resolver.get_expliclit_value(index);
-    };
-    
-    char* get_expliclit_value() {
-        return resolver.get_expliclit_value();
-    };
-    
-    // get full register value
-    char* get_full_value() { return resolver.get_full_value(); };
-    
-    // get sparse bitset
-    int get_sparse_count() { return resolver.get_sparse_count();};
-    
-    // get sparse register value
-    std::map& get_sparse_map() { 
-        return resolver.get_sparse_map();
-    };
-    
-    HllSetResolver* getHllSetResolver() { return &resolver;};
-    
-private :
-    
-    HllSetResolver resolver;
-};
- 
-
-template 
-class FieldHllUnionAggreator : public FieldAggregator {
-
-public: 
-
-    FieldHllUnionAggreator() {
-        reset();
-    } 
-    
-    void operator()(T left, const T right, uint32_t length) {
-        char* left_buf =  (char*)left;
-        char* right_buf = (char*)right;
-        // parse set
-        if (!_has_value) {
-            fill_set(left_buf);
-            _has_value = true;
-        }
-        fill_set(right_buf);
-    }
-
-    void fill_set(char* buf) {
-        HllSetResolver resolver; 
-        int length_value_type_len = sizeof(VarCharField::LengthValueType);
-        resolver.init(buf + length_value_type_len,
-                           *reinterpret_cast(buf));
-        resolver.parse();
-        if (resolver.get_hll_data_type() == HLL_DATA_EXPLICIT) {
-            // expliclit set
-            resolver.fill_hash64_set(&_hash64_set);
-        } else if (resolver.get_hll_data_type() != HLL_DATA_EMPTY) {
-            // full or sparse
-            _has_sparse_or_full = true;
-            resolver.fill_registers(_registers, HLL_REGISTERS_COUNT);
-        } else {
-            // empty
-        }
-    }
-
-    virtual void finalize_one_merge(T t);
-
-    void reset() {
-        memset(_registers, 0, HLL_REGISTERS_COUNT);
-        _hash64_set.clear();
-        _has_value = false;
-        _has_sparse_or_full = false;
-    }
-
-private:
-
-    bool _has_value;
-    bool _has_sparse_or_full;
-    char _registers[HLL_REGISTERS_COUNT];
-    std::set _hash64_set;
-};
-
-class DateTimeField : public BaseField {                
-    public:
-    DateTimeField(const FieldInfo& field_info) : BaseField(field_info) {}
-
-    virtual OLAPStatus from_string(const std::string& value_string) {
-        tm time_tm;
-        char* res = strptime(value_string.c_str(), "%Y-%m-%d %H:%M:%S", &time_tm);
-
-        if (NULL != res) {
-            long value = ((time_tm.tm_year + 1900) * 10000L
-                          + (time_tm.tm_mon + 1) * 100L
-                          + time_tm.tm_mday) * 1000000L
-                         + time_tm.tm_hour * 10000L
-                         + time_tm.tm_min * 100L
-                         + time_tm.tm_sec;
-            from_storage(reinterpret_cast(&value));
-        } else {
-            // 1400 - 01 - 01
-            *_value() = 14000101000000;
-        }
-
-        return OLAP_SUCCESS;
-    }
-
-    virtual std::string to_string() const {
-        tm time_tm;
-        long tmp = *_value();
-        long part1 = (tmp / 1000000L);
-        long part2 = (tmp - part1 * 1000000L);
-
-        time_tm.tm_year = static_cast((part1 / 10000L) % 10000) - 1900;
-        time_tm.tm_mon = static_cast((part1 / 100) % 100) - 1;
-        time_tm.tm_mday = static_cast(part1 % 100);
-
-        time_tm.tm_hour = static_cast((part2 / 10000L) % 10000);
-        time_tm.tm_min = static_cast((part2 / 100) % 100);
-        time_tm.tm_sec = static_cast(part2 % 100);
-
-        char buf[20] = {'\0'};
-        strftime(buf, 20, "%Y-%m-%d %H:%M:%S", &time_tm);
-        return std::string(buf);
-    }
-
-    virtual std::string to_buf() const {
-        tm time_tm;
-        long tmp = *_value();
-        long part1 = (tmp / 1000000L);
-        long part2 = (tmp - part1 * 1000000L);
-
-        time_tm.tm_year = static_cast((part1 / 10000L) % 10000) - 1900;
-        time_tm.tm_mon = static_cast((part1 / 100) % 100) - 1;
-        time_tm.tm_mday = static_cast(part1 % 100);
-
-        time_tm.tm_hour = static_cast((part2 / 10000L) % 10000);
-        time_tm.tm_min = static_cast((part2 / 100) % 100);
-        time_tm.tm_sec = static_cast(part2 % 100);
-
-        char buf[20] = {'\0'};
-        strftime(buf, 20, "%Y-%m-%d %H:%M:%S", &time_tm);
-        return std::string(buf);
-    }
-
-    virtual void set_to_max() {
-        // 设置为最大时间,其å«ä¹‰ä¸ºï¼š9999-12-31 23:59:59
-        long value = 99991231235959L;
-        from_storage(reinterpret_cast(&value));
-    }
-    virtual void set_to_min() {
-        long value = 101000000;
-        from_storage(reinterpret_cast(&value));
-    }
-    virtual bool is_min() {
-        long value = 101000000;
-        if (*(reinterpret_cast(_buf)) == value) {
-            return true;
-        } else {
-            return false;
-        }
-    }
-
-};
-
-// 实现对Date类型的处ç†
-// MySQL内部使用3个字节存储
-// 具体格å¼ä¸º: year * 16 * 32 + month * 32 + day
-// 这里也直接采用MySQL的存储格å¼
-// 继承自BaseFiled
-class DateField : public BaseField {
-public:
-    DateField(const FieldInfo& field_info) : BaseField(field_info) {}
-
-    virtual OLAPStatus from_string(const std::string& value_string) {
-        tm time_tm;
-        char* res = strptime(value_string.c_str(), "%Y-%m-%d", &time_tm);
-
-        if (NULL != res) {
-            int value = (time_tm.tm_year + 1900) * 16 * 32
-                        + (time_tm.tm_mon + 1) * 32
-                        + time_tm.tm_mday;
-            *_value() = value;
-        } else {
-            // 1400 - 01 - 01
-            *_value() = 716833;
-        }
-
-        return OLAP_SUCCESS;
-    }
-
-    virtual std::string to_string() const {
-        tm time_tm;
-        int value = *_value();
-        memset(&time_tm, 0, sizeof(time_tm));
-        time_tm.tm_mday = static_cast(value & 31);
-        time_tm.tm_mon = static_cast(value >> 5 & 15) - 1;
-        time_tm.tm_year = static_cast(value >> 9) - 1900;
-        char buf[20] = {'\0'};
-        strftime(buf, sizeof(buf), "%Y-%m-%d", &time_tm);
-        return std::string(buf);
-    }
-
-    virtual std::string to_buf() const {
-        tm time_tm;
-        int value = *_value();
-        memset(&time_tm, 0, sizeof(time_tm));
-        time_tm.tm_mday = static_cast(value & 31);
-        time_tm.tm_mon = static_cast(value >> 5 & 15) - 1;
-        time_tm.tm_year = static_cast(value >> 9) - 1900;
-        char buf[20] = {'\0'};
-        strftime(buf, sizeof(buf), "%Y-%m-%d", &time_tm);
-        return std::string(buf);
-    }
-
-    virtual void set_to_max() {
-        int value = 9999 * 16 * 32
-                    + 12 * 32
-                    + 31;
-        *_value() = value;
-    }
-
-    virtual void set_to_min() {
-        int value = 0 * 16 * 32 + 1 * 32 + 1;
-        *_value() = value;
-    }
-
-    virtual bool is_min() {
-        if (33 == *_value()) {
-            return true;
-        } else {
-            return false;
-        }
-    }
-
-};
-
-// 实现对DISCRETE DOUBLE类型的处ç†
-// 由于内部存储使用int64实现
-// 因此继承自BaseFiled
-class DiscreteDoubleField : public NumericField {
-public:
-    DiscreteDoubleField(const FieldInfo& field_info) : NumericField(field_info) {};
-
-    size_t mysql_size() const {
-        return sizeof(double);
-    }
-
-    // 把数æ®è½¬ä¸ºdouble
-    double to_double() const {
-        return *_value() / RATIO;
-    }
-
-    virtual void to_mysql(char* buf) const {
-        double value = to_double();
-        memcpy(buf, &value, sizeof(double));
-    }
-
-    // 转æ¢ä¸ºå¯æ‰“å°çš„字符串
-    virtual std::string to_string() const {
-        double value = *_value() / RATIO;
-        char buf[1024] = {'\0'};
-        snprintf(buf, sizeof(buf), "%.10f", value);
-        return std::string(buf);
-    }
-
-    virtual std::string to_buf() const {
-        double value = *_value() / RATIO;
-        char buf[1024] = {'\0'};
-        snprintf(buf, sizeof(buf), "%.10f", value);
-        return std::string(buf);
-    }
-
-    // 将值从字符串转æ¢åˆ°å†…部buf中
-    // å­—ç¬¦ä¸²æ ¼å¼æ˜¯ä»¥double的方å¼è¡¨ç¤º
-    virtual OLAPStatus from_string(const std::string& value_string) {
-        double double_val = atof(value_string.c_str());
-
-        if (double_val < DISCRETE_DOUBLE_MIN || double_val > DISCRETE_DOUBLE_MAX) {
-            OLAP_LOG_WARNING("value in disrete double is overflow. [value=%.10f]", double_val);
-            return OLAP_ERR_INPUT_PARAMETER_ERROR;
-        }
-
-        long value = static_cast(double_val * RATIO);
-        from_storage(reinterpret_cast(&value));
-        return OLAP_SUCCESS;
-    }
-private:
-    // 从double转æ¢ä¸ºint64_t类型时放大的比例
-    // è¯¥å€¼å†³å®šäº†å°æ•°éƒ¨åˆ†çš„精度
-    static constexpr double RATIO = 1000000.0;
-    static constexpr double DISCRETE_DOUBLE_MAX = LONG_MAX / 1000000.0;
-    static constexpr double DISCRETE_DOUBLE_MIN = LONG_MIN / 1000000.0;
-};
-
-// decimal
-class DecimalField: public BaseField {
-public:
-    DecimalField(const FieldInfo& field_info) : BaseField(field_info) {
-        _precision = field_info.precision;
-        _frac = field_info.frac;
-        _length = sizeof(int64_t) + sizeof(int32_t);
-    };
-
-    virtual void attach_field(char* buf) {
-        _is_null = buf;
-        _buf = buf + 1;
-    }
-
-    virtual void attach_buf(char* buf) {
-        _buf = buf;
-    }
-
-    bool allocate() {
-        if (Field::allocate()) {
-            attach_buf(_buf);
-            return true;
-        }
-
-        return false;
-    }
-
-    size_t mysql_size() const {
-        return sizeof(int64_t) + sizeof(int32_t);
-    }
-
-    // 把数æ®è½¬ä¸ºdouble
-    double to_double() const {
-        decimal12_t* data_ptr = _value();
-        return data_ptr->integer + double(data_ptr->fraction / decimal12_t::FRAC_RATIO);
-    }
-
-    virtual OLAPStatus from_string(const std::string& value_string) {
-        decimal12_t* data_ptr = _value();
-        return data_ptr->from_string(value_string);
-    }
-
-    virtual int real_cmp(const Field* field) const {
-        decimal12_t* data_ptr = _value();
-        decimal12_t* other = reinterpret_cast(field->buf());
-        return data_ptr->cmp(*other);
-    }
-
-    virtual void from_storage(const char* buf) {
-        memcpy(_buf, buf, _length);
-    }
-    virtual void to_storage(char* buf) const {
-        memcpy(buf, _buf, _length);
-    }
-    virtual void to_mysql(char* buf) const {
-        memcpy(buf, _buf, _length);
-    }
-
-    virtual void set_to_max() {
-        if (NULL != _buf) {
-            decimal12_t* data_ptr = _value();
-            data_ptr->integer = 999999999999999999;
-            data_ptr->fraction = 999999999;
-        }
-    }
-    virtual void set_to_min() {
-        if (NULL != _buf) {
-            decimal12_t* data_ptr = _value();
-            data_ptr->integer = -999999999999999999;
-            data_ptr->fraction = -999999999;
-        }
-    }
-    virtual bool is_min() {
-        if (NULL == _buf) {
-            return true;
-        } else if (_value()->integer == -999999999999999999
-            && _value()->fraction == -999999999){
-            return true;
-        } else {
-            return false;
-        }
-    }
-
-private:
-    // using fix ratio, 10^9
-    static const int32_t INT_STORE_BYTE = 8;
-    static const int32_t FRAC_STORE_BYTE = 4;
-    static const int32_t STORE_BYTE = INT_STORE_BYTE + FRAC_STORE_BYTE;
-
-    // frac set by user defination
-    int32_t _frac;
-    // precision set by user defination
-    int32_t _precision;
-};
-
-// è¿™é‡Œæ˜¯ä¸€ä¸ªä¼˜åŒ–çš„å®žçŽ°ï¼Œä¸»è¦æ˜¯åŸºäºŽä¸€ä¸‹å‡ ç‚¹çš„考虑
-// 1. 大部分数æ®ç±»åž‹çš„长度都是1 2 4 8
-// 2. memcpy对编译期确定长度的拷è´å®žçŽ°ä¼šå¾ˆå¿«, 大概是éžå®šé•¿çš„10å€å·¦å³
-// 3. 去掉了之å‰å®žçŽ°æ‰€å¸¦æ¥çš„虚函数调用的开销
-// 4. 对CPUçš„åˆ†æ”¯é¢„æµ‹å’ŒæŒ‡ä»¤æµæ°´çº¿å¤„ç†æ›´å‹å¥½
-// 对å•ç‰ˆæœ¬æ‰«ææ€§èƒ½çš„æ•´ä½“优化效果大概有15%å·¦å³
-void Field::copy(const Field* field) {
-
-    *_is_null = *(field->get_null());
-
-    if (OLAP_UNLIKELY(OLAP_FIELD_TYPE_VARCHAR == field->type()) || OLAP_UNLIKELY(OLAP_FIELD_TYPE_HLL == field->type())) {
-        memory_copy(_buf, field->buf(), field->size());
+// 返回-1,0,1,分别代表当å‰fieldå°äºŽï¼Œç­‰äºŽï¼Œå¤§äºŽä¼ å…¥å‚数中的field
+inline int Field::cmp(char* left, char* right) const {
+    bool l_null = *reinterpret_cast(left);
+    bool r_null = *reinterpret_cast(right);
+    if (l_null != r_null) {
+        return l_null ? -1 : 1;
     } else {
-        switch (_length) {
-        case 1:
-            *_buf = *field->buf();
-            break;
+        return l_null ? 0 : (_type_info->cmp(left + 1, right + 1));
+    }
+}
 
-        case 2:
-            *reinterpret_cast(_buf) = *reinterpret_cast(field->buf());
-            break;
+inline int Field::index_cmp(char* left, char* right) const {
+    bool l_null = *reinterpret_cast(left);
+    bool r_null = *reinterpret_cast(right);
+    if (l_null != r_null) {
+        return l_null ? -1 : 1;
+    } else if (l_null){
+        return 0;
+    }
 
-        case 4:
-            *reinterpret_cast(_buf) = *reinterpret_cast(field->buf());
-            break;
+    int32_t res = 0;
+    if (_type == OLAP_FIELD_TYPE_VARCHAR) {
+        StringSlice* l_slice = reinterpret_cast(left + 1);
+        StringSlice* r_slice = reinterpret_cast(right + 1);
 
-        case 8:
-            *reinterpret_cast(_buf) = *reinterpret_cast(field->buf());
-            break;
-
-        case 16:
-            *reinterpret_cast(_buf) = *reinterpret_cast(field->buf());
-            break;
-
-        default:
-            memory_copy(_buf, field->buf(), field->size());
-            //OLAP_LOG_DEBUG("++++ copy length: %lu", field->size());
-            break;
+        if (r_slice->size + OLAP_STRING_MAX_BYTES > _index_size) {
+            // 如果field的实际长度比short key长,则仅比较å‰ç¼€ï¼Œç¡®ä¿ç›¸åŒshort key的所有block都被扫æï¼Œ
+            // å¦åˆ™ï¼Œå¯ä»¥ç›´æŽ¥æ¯”较short keyå’Œfield
+            int compare_size = _index_size - OLAP_STRING_MAX_BYTES;
+            res = strncmp(l_slice->data, r_slice->data, compare_size);
+        } else {
+            res = l_slice->compare(*r_slice);
         }
+    } else {
+        res = _type_info->cmp(left + 1, right + 1);
     }
+
+    return res;
 }
 
-// 类似于上é¢copy的优化原ç†ï¼Œå¦å¤–还考虑到了作为Keyçš„ç±»åž‹ä¸»è¦æ˜¯4ç§UNSIGNEDçš„INT
-// 对多版本扫æçš„æ•´ä½“优化效果大概有15%å·¦å³
-int Field::cmp(const Field* field) const {
-    bool first = is_null();
-    bool second = field->is_null();
-    if (first == second && 1 == first) {
-        return 0;
-    } else if (first != second && 0 == first) {
-        return 1;
-    } else if (first != second && 1 == first) {
-        return -1;
-    }
+inline bool Field::equal(char* left, char* right) {
+    bool l_null = *reinterpret_cast(left);
+    bool r_null = *reinterpret_cast(right);
 
-    switch (_field_type) {
-    case OLAP_FIELD_TYPE_UNSIGNED_TINYINT:
-        return reinterpret_cast*>(this)->
-               BaseField::real_cmp(field);
-
-    case OLAP_FIELD_TYPE_UNSIGNED_SMALLINT:
-        return reinterpret_cast*>(this)->
-               BaseField::real_cmp(field);
-
-    case OLAP_FIELD_TYPE_UNSIGNED_INT:
-        return reinterpret_cast*>(this)->
-               BaseField::real_cmp(field);
-
-    case OLAP_FIELD_TYPE_UNSIGNED_BIGINT:
-        return reinterpret_cast*>(this)->
-               BaseField::real_cmp(field);
-
-    case OLAP_FIELD_TYPE_TINYINT:
-        return reinterpret_cast*>(this)->
-               BaseField::real_cmp(field);
-
-    case OLAP_FIELD_TYPE_SMALLINT:
-        return reinterpret_cast*>(this)->
-               BaseField::real_cmp(field);
-
-    case OLAP_FIELD_TYPE_INT:
-        return reinterpret_cast*>(this)->
-               BaseField::real_cmp(field);
-
-    case OLAP_FIELD_TYPE_BIGINT:
-        return reinterpret_cast*>(this)->
-               BaseField::real_cmp(field);
-
-    case OLAP_FIELD_TYPE_LARGEINT:
-        return reinterpret_cast*>(this)->
-               BaseField::real_cmp(field);
-
-    default:
-        return real_cmp(field);
-    }
-}
-
-// 类似于上é¢copy的优化原ç†ï¼Œå¦å¤–还考虑到了作为Keyçš„ç±»åž‹ä¸»è¦æ˜¯4ç§UNSIGNEDçš„INT
-// 对多版本扫æçš„æ•´ä½“优化效果大概有15%å·¦å³
-int Field::index_cmp(const Field* field) const {
-    bool first = is_null();
-    bool second = field->is_null();
-    if (first == second && 1 == first) {
-        return 0;
-    } else if (first != second && 0 == first) {
-        return 1;
-    } else if (first != second && 1 == first) {
-        return -1;
-    }
-
-    switch (_field_type) {
-    case OLAP_FIELD_TYPE_UNSIGNED_TINYINT:
-        return reinterpret_cast*>(this)->
-               BaseField::real_cmp(field);
-
-    case OLAP_FIELD_TYPE_UNSIGNED_SMALLINT:
-        return reinterpret_cast*>(this)->
-               BaseField::real_cmp(field);
-
-    case OLAP_FIELD_TYPE_UNSIGNED_INT:
-        return reinterpret_cast*>(this)->
-               BaseField::real_cmp(field);
-
-    case OLAP_FIELD_TYPE_UNSIGNED_BIGINT:
-        return reinterpret_cast*>(this)->
-               BaseField::real_cmp(field);
-
-    case OLAP_FIELD_TYPE_TINYINT:
-        return reinterpret_cast*>(this)->
-               BaseField::real_cmp(field);
-
-    case OLAP_FIELD_TYPE_SMALLINT:
-        return reinterpret_cast*>(this)->
-               BaseField::real_cmp(field);
-
-    case OLAP_FIELD_TYPE_INT:
-        return reinterpret_cast*>(this)->
-               BaseField::real_cmp(field);
-
-    case OLAP_FIELD_TYPE_BIGINT:
-        return reinterpret_cast*>(this)->
-               BaseField::real_cmp(field);
-
-    case OLAP_FIELD_TYPE_LARGEINT:
-        return reinterpret_cast*>(this)->
-               BaseField::real_cmp(field);
-
-    case OLAP_FIELD_TYPE_VARCHAR:
-        return reinterpret_cast(this)->short_key_cmp(field);
-
-    default:
-        return real_cmp(field);
-    }
-}
-
-bool Field::equal(const Field* field) {
-    bool first = is_null();
-    bool second = field->is_null();
-    if (first == second && 1 == first) {
+    if (l_null != r_null) {
+        return false;
+    } else if (l_null) {
         return true;
-    } else if (first != second && 0 == first) {
-        return false;
-    } else if (first != second && 1 == first) {
-        return false;
+    } else {
+        return _type_info->equal(left + 1, right + 1);
     }
-
-    return memcmp_sse(_buf, field->buf(), size()) == 0;
 }
 
-void Field::to_mysql() {
-    if (OLAP_UNLIKELY(_field_type == OLAP_FIELD_TYPE_DISCRETE_DOUBLE)) {
-        reinterpret_cast(this)->DiscreteDoubleField::to_mysql(_buf);
+inline void Field::aggregate(char* dest, char* src) {
+    _aggregate_func(dest, src);
+}
+
+inline void Field::finalize(char* data) {
+    if (OLAP_UNLIKELY(_type == OLAP_FIELD_TYPE_HLL)) {
+        // hyperloglog type use this function
+        _finalize_func(data);
     }
 }
 
+inline void Field::copy_with_pool(char* dest, const char* src, MemPool* mem_pool) {
+    bool is_null = *reinterpret_cast(src);
+    *reinterpret_cast(dest) = is_null;
+    if (is_null) {
+        return;
+    }
+    _type_info->copy_with_pool(dest + 1, src + 1, mem_pool);
+}
+
+inline void Field::copy_without_pool(char* dest, const char* src) {
+    bool is_null = *reinterpret_cast(src);
+    *reinterpret_cast(dest) = is_null;
+    if (is_null) {
+        return;
+    }
+    return _type_info->copy_without_pool(dest + 1, src + 1);
+}
+
+inline void Field::agg_init(char* dest, const char* src) {
+    if (OLAP_LIKELY(_type != OLAP_FIELD_TYPE_HLL)) {
+        copy_without_pool(dest, src);
+    } else {
+        StringSlice* slice = reinterpret_cast(dest + 1);
+        size_t hll_ptr = *(size_t*)(slice->data - sizeof(HllContext*));
+        HllContext* context = (reinterpret_cast(hll_ptr));
+        HllSetHelper::init_context(context);
+        HllSetHelper::fill_set(src + 1, context);
+        context->has_value = true;
+    }
+}
+
+inline void Field::to_index(char* dest, const char* src) {
+    bool is_null = *reinterpret_cast(src);
+    *reinterpret_cast(dest) = is_null;
+    if (is_null) {
+        return;
+    }
+
+    if (_type == OLAP_FIELD_TYPE_VARCHAR) {
+        // å…ˆæ¸…é›¶ï¼Œå†æ‹·è´
+        memset(dest + 1, 0, _index_size);
+        const StringSlice* slice = reinterpret_cast(src + 1);
+        size_t copy_size = slice->size < _index_size - OLAP_STRING_MAX_BYTES ?
+                           slice->size : _index_size - OLAP_STRING_MAX_BYTES;
+        *reinterpret_cast(dest + 1) = copy_size;
+        memory_copy(dest + OLAP_STRING_MAX_BYTES + 1, slice->data, copy_size);
+    } else if (_type == OLAP_FIELD_TYPE_CHAR) {
+        // å…ˆæ¸…é›¶ï¼Œå†æ‹·è´
+        memset(dest + 1, 0, _index_size);
+        const StringSlice* slice = reinterpret_cast(src + 1);
+        memory_copy(dest + 1, slice->data, _index_size);
+    } else {
+        memory_copy(dest + 1, src + 1, size());
+    }
+}
+
+inline uint32_t Field::hash_code(char* data, uint32_t seed) const {
+    bool is_null = (*reinterpret_cast(data) != 0);
+    if (is_null) {
+        return HashUtil::hash(&is_null, sizeof(is_null), seed);
+    }
+    return _type_info->hash_code(data + 1, seed);
+}
+
 }  // namespace palo
 
 #endif // BDG_PALO_BE_SRC_OLAP_FIELD_H
diff --git a/be/src/olap/field_info.cpp b/be/src/olap/field_info.cpp
new file mode 100644
index 0000000000..1e39e552f2
--- /dev/null
+++ b/be/src/olap/field_info.cpp
@@ -0,0 +1,232 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include 
+#include "olap/field_info.h"
+
+using std::nothrow;
+using std::string;
+
+namespace palo {
+
+FieldType FieldInfo::get_field_type_by_string(const string& type_str) {
+    string upper_type_str = type_str;
+    std::transform(type_str.begin(), type_str.end(), upper_type_str.begin(), toupper);
+    FieldType type;
+
+    if (0 == upper_type_str.compare("TINYINT")) {
+        type = OLAP_FIELD_TYPE_TINYINT;
+    } else if (0 == upper_type_str.compare("SMALLINT")) {
+        type = OLAP_FIELD_TYPE_SMALLINT;
+    } else if (0 == upper_type_str.compare("INT")) {
+        type = OLAP_FIELD_TYPE_INT;
+    } else if (0 == upper_type_str.compare("BIGINT")) {
+        type = OLAP_FIELD_TYPE_BIGINT;
+    } else if (0 == upper_type_str.compare("LARGEINT")) {
+        type = OLAP_FIELD_TYPE_LARGEINT;
+    } else if (0 == upper_type_str.compare("UNSIGNED_TINYINT")) {
+        type = OLAP_FIELD_TYPE_UNSIGNED_TINYINT;
+    } else if (0 == upper_type_str.compare("UNSIGNED_SMALLINT")) {
+        type = OLAP_FIELD_TYPE_UNSIGNED_SMALLINT;
+    } else if (0 == upper_type_str.compare("UNSIGNED_INT")) {
+        type = OLAP_FIELD_TYPE_UNSIGNED_INT;
+    } else if (0 == upper_type_str.compare("UNSIGNED_BIGINT")) {
+        type = OLAP_FIELD_TYPE_UNSIGNED_BIGINT;
+    } else if (0 == upper_type_str.compare("FLOAT")) {
+        type = OLAP_FIELD_TYPE_FLOAT;
+    } else if (0 == upper_type_str.compare("DISCRETE_DOUBLE")) {
+        type = OLAP_FIELD_TYPE_DISCRETE_DOUBLE;
+    } else if (0 == upper_type_str.compare("DOUBLE")) {
+        type = OLAP_FIELD_TYPE_DOUBLE;
+    } else if (0 == upper_type_str.compare("CHAR")) {
+        type = OLAP_FIELD_TYPE_CHAR;
+    } else if (0 == upper_type_str.compare("DATE")) {
+        type = OLAP_FIELD_TYPE_DATE;
+    } else if (0 == upper_type_str.compare("DATETIME")) {
+        type = OLAP_FIELD_TYPE_DATETIME;
+    } else if (0 == upper_type_str.compare(0, 7, "DECIMAL")) {
+        type = OLAP_FIELD_TYPE_DECIMAL;
+    } else if (0 == upper_type_str.compare(0, 7, "VARCHAR")) {
+        type = OLAP_FIELD_TYPE_VARCHAR;
+    } else if (0 == upper_type_str.compare(0, 3, "HLL")) {
+        type = OLAP_FIELD_TYPE_HLL;
+    } else if (0 == upper_type_str.compare("STRUCT")) {
+        type = OLAP_FIELD_TYPE_STRUCT;
+    } else if (0 == upper_type_str.compare("LIST")) {
+        type = OLAP_FIELD_TYPE_LIST;
+    } else if (0 == upper_type_str.compare("MAP")) {
+        type = OLAP_FIELD_TYPE_MAP;
+    } else {
+        OLAP_LOG_WARNING("invalid type string. [type='%s']", type_str.c_str());
+        type = OLAP_FIELD_TYPE_UNKNOWN;
+    }
+
+    return type;
+}
+
+FieldAggregationMethod FieldInfo::get_aggregation_type_by_string(const string& str) {
+    string upper_str = str;
+    std::transform(str.begin(), str.end(), upper_str.begin(), toupper);
+    FieldAggregationMethod aggregation_type;
+
+    if (0 == upper_str.compare("NONE")) {
+        aggregation_type = OLAP_FIELD_AGGREGATION_NONE;
+    } else if (0 == upper_str.compare("SUM")) {
+        aggregation_type = OLAP_FIELD_AGGREGATION_SUM;
+    } else if (0 == upper_str.compare("MIN")) {
+        aggregation_type = OLAP_FIELD_AGGREGATION_MIN;
+    } else if (0 == upper_str.compare("MAX")) {
+        aggregation_type = OLAP_FIELD_AGGREGATION_MAX;
+    } else if (0 == upper_str.compare("REPLACE")) {
+        aggregation_type = OLAP_FIELD_AGGREGATION_REPLACE;
+    } else if (0 == upper_str.compare("HLL_UNION")) {
+        aggregation_type = OLAP_FIELD_AGGREGATION_HLL_UNION;
+    } else {
+        OLAP_LOG_WARNING("invalid aggregation type string. [aggregation='%s']", str.c_str());
+        aggregation_type = OLAP_FIELD_AGGREGATION_UNKNOWN;
+    }
+
+    return aggregation_type;
+}
+
+string FieldInfo::get_string_by_field_type(FieldType type) {
+    switch (type) {
+        case OLAP_FIELD_TYPE_TINYINT:
+            return "TINYINT";
+
+        case OLAP_FIELD_TYPE_UNSIGNED_TINYINT:
+            return "UNSIGNED_TINYINT";
+
+        case OLAP_FIELD_TYPE_SMALLINT:
+            return "SMALLINT";
+
+        case OLAP_FIELD_TYPE_UNSIGNED_SMALLINT:
+            return "UNSIGNED_SMALLINT";
+
+        case OLAP_FIELD_TYPE_INT:
+            return "INT";
+
+        case OLAP_FIELD_TYPE_UNSIGNED_INT:
+            return "UNSIGNED_INT";
+
+        case OLAP_FIELD_TYPE_BIGINT:
+            return "BIGINT";
+
+        case OLAP_FIELD_TYPE_LARGEINT:
+            return "LARGEINT";
+
+        case OLAP_FIELD_TYPE_UNSIGNED_BIGINT:
+            return "UNSIGNED_BIGINT";
+
+        case OLAP_FIELD_TYPE_FLOAT:
+            return "FLOAT";
+
+        case OLAP_FIELD_TYPE_DOUBLE:
+            return "DOUBLE";
+
+        case OLAP_FIELD_TYPE_DISCRETE_DOUBLE:
+            return "DISCRETE_DOUBLE";
+
+        case OLAP_FIELD_TYPE_CHAR:
+            return "CHAR";
+
+        case OLAP_FIELD_TYPE_DATE:
+            return "DATE";
+
+        case OLAP_FIELD_TYPE_DATETIME:
+            return "DATETIME";
+
+        case OLAP_FIELD_TYPE_DECIMAL:
+            return "DECIMAL";
+
+        case OLAP_FIELD_TYPE_VARCHAR:
+            return "VARCHAR";
+
+        case OLAP_FIELD_TYPE_HLL:
+            return "HLL";
+
+        case OLAP_FIELD_TYPE_STRUCT:
+            return "STRUCT";
+
+        case OLAP_FIELD_TYPE_LIST:
+            return "LIST";
+
+        case OLAP_FIELD_TYPE_MAP:
+            return "MAP";
+
+        default:
+            return "UNKNOWN";
+    }
+}
+
+string FieldInfo::get_string_by_aggregation_type(FieldAggregationMethod type) {
+    switch (type) {
+        case OLAP_FIELD_AGGREGATION_NONE:
+            return "NONE";
+
+        case OLAP_FIELD_AGGREGATION_SUM:
+            return "SUM";
+
+        case OLAP_FIELD_AGGREGATION_MIN:
+            return "MIN";
+
+        case OLAP_FIELD_AGGREGATION_MAX:
+            return "MAX";
+
+        case OLAP_FIELD_AGGREGATION_REPLACE:
+            return "REPLACE";
+
+        case OLAP_FIELD_AGGREGATION_HLL_UNION:
+            return "HLL_UNION";
+
+        default:
+            return "UNKNOWN";
+    }
+}
+
+uint32_t FieldInfo::get_field_length_by_type(TPrimitiveType::type type, uint32_t string_length) {
+    switch (type) {
+        case TPrimitiveType::TINYINT:
+            return 1;
+        case TPrimitiveType::SMALLINT:
+            return 2;
+        case TPrimitiveType::INT:
+            return 4;
+        case TPrimitiveType::BIGINT:
+            return 8;
+        case TPrimitiveType::LARGEINT:
+            return 16;
+        case TPrimitiveType::DATE:
+            return 3;
+        case TPrimitiveType::DATETIME:
+            return 8;
+        case TPrimitiveType::FLOAT:
+            return 4;
+        case TPrimitiveType::DOUBLE:
+            return 8;
+        case TPrimitiveType::CHAR:
+            return string_length;
+        case TPrimitiveType::VARCHAR:
+        case TPrimitiveType::HLL:
+            return string_length + sizeof(OLAP_STRING_MAX_LENGTH);
+        case TPrimitiveType::DECIMAL:    
+            return 12; // use 12 bytes in olap engine.
+        default:
+            OLAP_LOG_WARNING("unknown field type. [type=%d]", type);
+            return 0;
+    }
+}
+
+}  // namespace palo
diff --git a/be/src/olap/field_info.h b/be/src/olap/field_info.h
new file mode 100644
index 0000000000..64cfee03c8
--- /dev/null
+++ b/be/src/olap/field_info.h
@@ -0,0 +1,351 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_SRC_OLAP_FIELD_INFO_H
+#define BDG_PALO_BE_SRC_OLAP_FIELD_INFO_H
+
+#include "olap/olap_common.h"
+#include "olap/olap_define.h"
+#include "olap/utils.h"
+
+namespace palo {
+
+// 定义uint24_t类型和需è¦çš„一些方法
+struct uint24_t {
+public:
+    uint24_t() {
+        memset(data, 0, sizeof(data));
+    }
+
+    uint24_t(const uint24_t& value) {
+        data[0] = value.data[0];
+        data[1] = value.data[1];
+        data[2] = value.data[2];
+    }
+
+    uint24_t(const int32_t& value) {
+        data[0] = static_cast(value);
+        data[1] = static_cast(value >> 8);
+        data[2] = static_cast(value >> 16);
+    }
+
+    uint24_t& operator+=(const uint24_t& value) {
+        *this = static_cast(*this) + static_cast(value);
+        return *this;
+    }
+
+    operator int() const {
+        int value = static_cast(data[0]);
+        value += (static_cast(static_cast(data[1]))) << 8;
+        value += (static_cast(static_cast(data[2]))) << 16;
+        return value;
+    }
+
+    uint24_t& operator=(const int& value) {
+        data[0] = static_cast(value);
+        data[1] = static_cast(value >> 8);
+        data[2] = static_cast(value >> 16);
+        return *this;
+    }
+
+    uint24_t& operator=(const int64_t& value) {
+        data[0] = static_cast(value);
+        data[1] = static_cast(value >> 8);
+        data[2] = static_cast(value >> 16);
+        return *this;
+    }
+
+    bool operator==(const uint24_t& value) const {
+        return cmp(value) == 0;
+    }
+
+    bool operator!=(const uint24_t& value) const {
+        return cmp(value) != 0;
+    }
+
+    bool operator<(const uint24_t& value) const {
+        return cmp(value) < 0;
+    }
+
+    bool operator<=(const uint24_t& value) const {
+        return cmp(value) <= 0;
+    }
+
+    bool operator>(const uint24_t& value) const {
+        return cmp(value) > 0;
+    }
+
+    bool operator>=(const uint24_t& value) const {
+        return cmp(value) >= 0;
+    }
+
+    int32_t cmp(const uint24_t& other) const {
+        if (data[2] > other.data[2]) {
+            return 1;
+        } else if (data[2] < other.data[2]) {
+            return -1;
+        }
+
+        if (data[1] > other.data[1]) {
+            return 1;
+        } else if (data[1] < other.data[1]) {
+            return -1;
+        }
+
+        if (data[0] > other.data[0]) {
+            return 1;
+        } else if (data[0] < other.data[0]) {
+            return -1;
+        }
+
+        return 0;
+    }
+
+private:
+    uint8_t data[3];
+} __attribute__((packed));
+
+inline std::ostream& operator<<(std::ostream& os, const uint24_t& val) {
+    return os;
+}
+
+struct decimal12_t {
+    decimal12_t() : integer(0), fraction(0) {}
+    decimal12_t(int64_t int_part, int32_t frac_part) {
+        integer = int_part;
+        fraction = frac_part;
+    }
+
+    decimal12_t(const decimal12_t& value) {
+        integer = value.integer;
+        fraction = value.fraction;
+    }
+
+    decimal12_t& operator+=(const decimal12_t& value) {
+        fraction += value.fraction;
+        integer += value.integer;
+
+        if (fraction >= FRAC_RATIO) {
+            integer += 1;
+            fraction -= FRAC_RATIO;
+        } else if (fraction <= -FRAC_RATIO) {
+            integer -= 1;
+            fraction += FRAC_RATIO;
+        }
+
+        if (fraction * integer < 0) {
+            bool sign = integer < 0;
+            integer += (sign ? 1 : -1);
+            fraction += (sign ? -FRAC_RATIO : FRAC_RATIO);
+        }
+
+        //OLAP_LOG_WARNING("agg: int=%ld, frac=%d", integer, fraction);
+        //_set_flag();
+        return *this;
+    }
+
+    // call field::copy
+    decimal12_t& operator=(const decimal12_t& value) {
+        integer = value.integer;
+        fraction = value.fraction;
+        return *this;
+    }
+
+    bool operator<(const decimal12_t& value) const {
+        return cmp(value) < 0;
+    }
+
+    bool operator<=(const decimal12_t& value) const {
+        return cmp(value) <= 0;
+    }
+
+    bool operator>(const decimal12_t& value) const {
+        return cmp(value) > 0;
+    }
+
+    bool operator>=(const decimal12_t& value) const {
+        return cmp(value) >= 0;
+    }
+
+    bool operator==(const decimal12_t& value) const {
+        return cmp(value) == 0;
+    }
+
+    bool operator!=(const decimal12_t& value) const {
+        return cmp(value) != 0;
+    }
+
+    int32_t cmp(const decimal12_t& other) const {
+        if (integer > other.integer) {
+            return 1;
+        } else if (integer == other.integer) {
+            if (fraction > other.fraction) {
+                return 1;
+            } else if (fraction == other.fraction) {
+                return 0;
+            }
+        }
+
+        return -1;
+    }
+
+    std::string to_string() {
+        char buf[128] = {'\0'};
+
+        if (integer < 0 || fraction < 0) {
+            snprintf(buf, sizeof(buf), "-%lu.%09u",
+                     std::abs(integer), std::abs(fraction));
+        } else {
+            snprintf(buf, sizeof(buf), "%lu.%09u",
+                     std::abs(integer), std::abs(fraction));
+        }
+
+        return std::string(buf);
+    }
+
+    OLAPStatus from_string(const std::string& str) {
+        integer = 0;
+        fraction = 0;
+        const char* value_string = str.c_str();
+        const char* sign = strchr(value_string, '-');
+
+        if (sign != NULL) {
+            if (sign != value_string) {
+                return OLAP_ERR_INPUT_PARAMETER_ERROR;
+            } else {
+                ++value_string;
+            }
+        }
+
+        const char* sepr = strchr(value_string, '.');
+        if ((sepr != NULL && sepr - value_string > MAX_INT_DIGITS_NUM)
+                || (sepr == NULL && strlen(value_string) > MAX_INT_DIGITS_NUM)) {
+            integer = 999999999999999999;
+            fraction = 999999999;
+        } else {
+            if (sepr == value_string) {
+                sscanf(value_string, ".%9d", &fraction);
+                integer = 0;
+            } else {
+                sscanf(value_string, "%18ld.%9d", &integer, &fraction);
+            }
+
+            int32_t frac_len = (NULL != sepr) ?
+                               MAX_FRAC_DIGITS_NUM - strlen(sepr + 1) : MAX_FRAC_DIGITS_NUM;
+            frac_len = frac_len > 0 ? frac_len : 0;
+            fraction *= g_power_table[frac_len];
+        }
+
+        if (sign != NULL) {
+            fraction = -fraction;
+            integer = -integer;
+        }
+
+        return OLAP_SUCCESS;
+    }
+
+    static const int32_t FRAC_RATIO = 1000000000;
+    static const int32_t MAX_INT_DIGITS_NUM = 18;
+    static const int32_t MAX_FRAC_DIGITS_NUM = 9;
+
+    int64_t integer;
+    int32_t fraction;
+} __attribute__((packed));
+
+inline std::ostream& operator<<(std::ostream& os, const decimal12_t& val) {
+    return os;
+}
+
+// ä¿å­˜Field元信æ¯çš„结构体
+struct FieldInfo {
+public:
+    // åç§°
+    std::string name;
+    // æ•°æ®ç±»åž‹
+    FieldType type;
+    // èšé›†æ–¹å¼
+    FieldAggregationMethod aggregation;
+    // 长度,å•ä½ä¸ºå­—节
+    // 除字符串外,其它类型都是确定的
+    uint32_t length;
+    // å‰ç¼€ç´¢å¼•长度,如果为0,表示ä¸ä½¿ç”¨å‰ç¼€ç´¢å¼•,
+    // å¦åˆ™åˆ™æŒ‰ç…§éœ€è¦è®¾ç½®å‰ç¼€ç´¢å¼•,ç›®å‰åªå¯¹å­—符串起作用。
+    uint32_t index_length;
+    // æ˜¯å¦æ˜¯Primary Key
+    bool is_key;
+
+    bool has_default_value;
+    std::string default_value;
+
+    bool has_referenced_column;
+    std::string referenced_column;
+
+    // used to creating decimal data type
+    uint32_t precision;
+    uint32_t frac;
+
+    bool is_allow_null;
+    // 全局唯一id
+    uint32_t unique_id;
+    // å­åˆ—çš„index
+    std::vector sub_columns;
+    // æ˜¯å¦æ˜¯å…¶ä»–列的å­åˆ—
+    bool is_root_column;
+
+    // is bloom filter column
+    bool is_bf_column;
+public:
+    static std::string get_string_by_field_type(FieldType type);
+    static std::string get_string_by_aggregation_type(FieldAggregationMethod aggregation_type);
+    static FieldType get_field_type_by_string(const std::string& str);
+    static FieldAggregationMethod get_aggregation_type_by_string(const std::string& str);
+    static uint32_t get_field_length_by_type(TPrimitiveType::type type, uint32_t string_length);
+
+    OLAPStatus set_default_value(const char* str) {
+        default_value = str;
+
+        return OLAP_SUCCESS;
+    }
+
+    std::string to_string() const {
+        char buf[1024] = {'\0'};
+        snprintf(buf, sizeof(buf), "{name='%s' type='%s' aggregation='%s' length=%u is_key=%d "
+                 "is_allow_null=%d}",
+                 name.c_str(),
+                 get_string_by_field_type(type).c_str(),
+                 get_string_by_aggregation_type(aggregation).c_str(),
+                 length,
+                 is_key,
+                 is_allow_null);
+        return std::string(buf);
+    }
+
+    std::string to_json() const {
+        char buf[1024] = {'\0'};
+        snprintf(buf, sizeof(buf), "{\"name\":\"%s\",\"type\":\"%s\",\"aggregation\":\"%s\","
+                 "\"length\":%u,\"is_key\":%d,\"is_allow_null\":%d}",
+                 name.c_str(),
+                 get_string_by_field_type(type).c_str(),
+                 get_string_by_aggregation_type(aggregation).c_str(),
+                 length,
+                 is_key,
+                 is_allow_null);
+        return std::string(buf);
+    }
+};
+
+}  // namespace palo
+
+#endif // BDG_PALO_BE_SRC_OLAP_FIELD_INFO_H
diff --git a/be/src/olap/hll.cpp b/be/src/olap/hll.cpp
new file mode 100644
index 0000000000..39474ffc9a
--- /dev/null
+++ b/be/src/olap/hll.cpp
@@ -0,0 +1,230 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/hll.h"
+
+#include 
+#include 
+#include 
+#include 
+
+#include "olap/string_slice.h"
+
+using std::map;
+using std::nothrow;
+using std::string;
+using std::stringstream;
+
+namespace palo {
+
+void HllSetResolver::parse() {
+    // skip LengthValueType
+    char*  pdata = _buf_ref;
+    _set_type = (HllDataType)pdata[0];
+    char* sparse_data = NULL;
+    switch (_set_type) {
+        case HLL_DATA_EXPLICIT:
+            // first byte : type
+            // second~five byte : hash values's number
+            // five byte later : hash value
+            _expliclit_num = (ExpliclitLengthValueType) (pdata[sizeof(SetTypeValueType)]);
+            _expliclit_value = (uint64_t*)(pdata + sizeof(SetTypeValueType)
+                    + sizeof(ExpliclitLengthValueType));
+            break;
+        case HLL_DATA_SPRASE:
+            // first byte : type
+            // second ~(2^HLL_COLUMN_PRECISION)/8 byte : bitmap mark which is not zero
+            // 2^HLL_COLUMN_PRECISION)/8 + 1以åŽvalue
+            _sparse_count = (SparseLengthValueType*)(pdata + sizeof (SetTypeValueType));
+            sparse_data = pdata + sizeof(SetTypeValueType) + sizeof(SparseLengthValueType);
+            for (int i = 0; i < *_sparse_count; i++) {
+                SparseIndexType* index = (SparseIndexType*)sparse_data;
+                sparse_data += sizeof(SparseIndexType);
+                SparseValueType* value = (SparseValueType*)sparse_data;
+                _sparse_map[*index] = *value;
+                sparse_data += sizeof(SetTypeValueType);
+            }
+            break;
+        case HLL_DATA_FULL:
+            // first byte : type
+            // second byte later : hll register value
+            _full_value_position = pdata + sizeof (SetTypeValueType);
+            break;
+        default:
+            // HLL_DATA_EMPTY
+            break;
+    }
+}
+
+void HllSetResolver::fill_registers(char* registers, int len) {
+    if (_set_type == HLL_DATA_EXPLICIT) {
+        for (int i = 0; i < get_expliclit_count(); ++i) {
+            uint64_t hash_value = get_expliclit_value(i);
+            int idx = hash_value % len;
+            uint8_t first_one_bit = __builtin_ctzl(hash_value >> HLL_COLUMN_PRECISION) + 1;
+            registers[idx] = std::max((uint8_t)registers[idx], first_one_bit);
+        }
+    } else if (_set_type == HLL_DATA_SPRASE) {
+        std::map& sparse_map = get_sparse_map();
+        for (std::map::iterator iter = sparse_map.begin();
+                iter != sparse_map.end(); iter++) {
+            registers[iter->first] =
+                std::max((uint8_t)registers[iter->first], (uint8_t)iter->second);
+        }
+    } else if (_set_type == HLL_DATA_FULL) {
+        char* full_value = get_full_value();
+        for (int i = 0; i < len; i++) {
+            registers[i] = std::max((uint8_t)registers[i], (uint8_t)full_value[i]);
+        }
+
+    } else {
+        // HLL_DATA_EMPTY
+    }
+}
+
+void HllSetResolver::fill_index_to_value_map(std::map* index_to_value, int len) {
+    if (_set_type == HLL_DATA_EXPLICIT) {
+        for (int i = 0; i < get_expliclit_count(); ++i) {
+            uint64_t hash_value = get_expliclit_value(i);
+            int idx = hash_value % len;
+            uint8_t first_one_bit = __builtin_ctzl(hash_value >> HLL_COLUMN_PRECISION) + 1;
+            if (index_to_value->find(idx) != index_to_value->end()) {
+                (*index_to_value)[idx] =
+                    (*index_to_value)[idx] < first_one_bit ? first_one_bit : (*index_to_value)[idx];
+            } else {
+                (*index_to_value)[idx] = first_one_bit;
+            }
+        }
+    } else if (_set_type == HLL_DATA_SPRASE) {
+        std::map& sparse_map = get_sparse_map();
+        for (std::map::iterator iter = sparse_map.begin();
+                iter != sparse_map.end(); iter++) {
+            if (index_to_value->find(iter->first) != index_to_value->end()) {
+                (*index_to_value)[iter->first] =
+                    (*index_to_value)[iter->first]
+                        < iter->second ? iter->second : (*index_to_value)[iter->first];
+            } else {
+                (*index_to_value)[iter->first] = iter->second;
+            }
+        }
+    } else if (_set_type == HLL_DATA_FULL) {
+        char* registers = get_full_value();
+        for (int i = 0; i < len; i++) {
+            if (registers[i] != 0) {
+                if (index_to_value->find(i) != index_to_value->end()) {
+                    (*index_to_value)[i] =
+                        (*index_to_value)[i] < registers[i] ? registers[i]  : (*index_to_value)[i];
+                } else {
+                    (*index_to_value)[i] = registers[i];
+                }
+            }
+        }
+    }
+}
+
+void HllSetResolver::fill_hash64_set(std::set* hash_set) {
+    if (_set_type == HLL_DATA_EXPLICIT) {
+        for (int i = 0; i < get_expliclit_count(); ++i) {
+            uint64_t hash_value = get_expliclit_value(i);
+            hash_set->insert(hash_value);
+        }
+    }
+}
+
+void HllSetHelper::set_sparse(
+        char *result, const std::map& index_to_value, int& len) {
+    result[0] = HLL_DATA_SPRASE;
+    len = sizeof(HllSetResolver::SetTypeValueType) + sizeof(HllSetResolver::SparseLengthValueType);
+    char* write_value_pos = result + len;
+    for (std::map::const_iterator iter = index_to_value.begin();
+            iter != index_to_value.end(); iter++) {
+        write_value_pos[0] = (char)(iter->first & 0xff);
+        write_value_pos[1] = (char)(iter->first >> 8 & 0xff);
+        write_value_pos[2] = iter->second;
+        write_value_pos += 3;
+    }
+    int registers_count = index_to_value.size();
+    len += registers_count * (sizeof(HllSetResolver::SparseIndexType)
+            + sizeof(HllSetResolver::SparseValueType));
+    *(int*)(result + 1) = registers_count;
+}
+
+void HllSetHelper::set_expliclit(char* result, const std::set& hash_value_set, int& len) {
+    result[0] = HLL_DATA_EXPLICIT;
+    result[1] = (HllSetResolver::ExpliclitLengthValueType)hash_value_set.size();
+    len = sizeof(HllSetResolver::SetTypeValueType)
+        + sizeof(HllSetResolver::ExpliclitLengthValueType);
+    char* write_pos = result + len;
+    for (std::set::const_iterator iter = hash_value_set.begin();
+            iter != hash_value_set.end(); iter++) {
+        uint64_t hash_value = *iter;
+        *(uint64_t*)write_pos = hash_value;
+        write_pos += 8;
+    }
+    len += sizeof(uint64_t) * hash_value_set.size();
+}
+
+void HllSetHelper::set_full(char* result, const char* registers,
+        const int registers_len, int& len) {
+    result[0] = HLL_DATA_FULL;
+    memcpy(result + 1, registers, registers_len);
+    len = registers_len + sizeof(HllSetResolver::SetTypeValueType);
+}
+
+void HllSetHelper::set_full(char* result,
+        const std::map& index_to_value,
+        const int registers_len, int& len) {
+    result[0] = HLL_DATA_FULL;
+    for (std::map::const_iterator iter = index_to_value.begin();
+            iter != index_to_value.end(); iter++) {
+        result[1 + iter->first] = iter->second;
+    }
+    len = registers_len + sizeof(HllSetResolver::SetTypeValueType);
+}
+
+void HllSetHelper::set_max_register(char* registers, int registers_len,
+        const std::set& hash_set) {
+    for (std::set::const_iterator iter = hash_set.begin();
+            iter != hash_set.end(); iter++) {
+        uint64_t hash_value = *iter;
+        int idx = hash_value % registers_len;
+        uint8_t first_one_bit = __builtin_ctzl(hash_value >> HLL_COLUMN_PRECISION) + 1;
+        registers[idx] = std::max((uint8_t)registers[idx], first_one_bit);
+    }
+}
+
+void HllSetHelper::fill_set(const char* data, HllContext* context) {
+    HllSetResolver resolver;
+    const StringSlice* slice = reinterpret_cast(data);
+    resolver.init(slice->data, slice->size);
+    resolver.parse();
+    if (resolver.get_hll_data_type() == HLL_DATA_EXPLICIT) {
+        // expliclit set
+        resolver.fill_hash64_set(&(context->hash64_set));
+    } else if (resolver.get_hll_data_type() != HLL_DATA_EMPTY) {
+        // full or sparse
+        context->has_sparse_or_full = true;
+        resolver.fill_registers(context->registers, HLL_REGISTERS_COUNT);
+    }
+}
+
+void HllSetHelper::init_context(HllContext* context) {
+    memset(context->registers, 0, HLL_REGISTERS_COUNT);
+    context->hash64_set.clear();
+    context->has_value = false;
+    context->has_sparse_or_full = false;
+}
+
+}  // namespace palo
diff --git a/be/src/olap/hll.h b/be/src/olap/hll.h
new file mode 100644
index 0000000000..1e09f593f5
--- /dev/null
+++ b/be/src/olap/hll.h
@@ -0,0 +1,153 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_SRC_OLAP_HLL_H
+#define BDG_PALO_BE_SRC_OLAP_HLL_H
+
+#include 
+#include 
+#include 
+#include 
+
+// #include "olap/field_info.h"
+#include "olap/olap_common.h"
+// #include "olap/olap_define.h"
+
+namespace palo {
+
+const static int HLL_COLUMN_PRECISION = 14;
+const static int HLL_EXPLICLIT_INT64_NUM = 160;
+const static int HLL_REGISTERS_COUNT = 16384;
+// registers (2^14) + 1 (type)
+const static int HLL_COLUMN_DEFAULT_LEN = 16385;
+
+struct HllContext {
+    bool has_value;
+    bool has_sparse_or_full;
+    char registers[HLL_REGISTERS_COUNT];
+    std::set hash64_set;
+};
+
+// help parse hll set
+class HllSetResolver {
+public:
+    HllSetResolver() : _buf_ref(nullptr),
+                       _buf_len(0),
+                       _set_type(HLL_DATA_EMPTY),
+                       _full_value_position(nullptr),
+                       _expliclit_value(nullptr),
+                       _expliclit_num(0) {}
+
+    ~HllSetResolver() {}
+
+    typedef uint8_t SetTypeValueType;
+    typedef uint8_t ExpliclitLengthValueType;
+    typedef int32_t SparseLengthValueType;
+    typedef uint16_t SparseIndexType;
+    typedef uint8_t SparseValueType;
+
+    // only save pointer
+    void init(char* buf, int len){
+        this->_buf_ref = buf;
+        this->_buf_len = len;
+    }
+
+    // hll set type
+    HllDataType get_hll_data_type() {
+        return _set_type;
+    };
+
+    // expliclit value num
+    int get_expliclit_count() {
+        return (int)_expliclit_num;
+    };
+
+    // get expliclit index value 64bit
+    uint64_t get_expliclit_value(int index) {
+        if (index >= _expliclit_num) {
+            return -1;
+        }
+        return _expliclit_value[index];
+    };
+
+    // get expliclit index value 64bit
+    char* get_expliclit_value() {
+        return (char*)_expliclit_value;
+    };
+
+    // get full register value
+    char* get_full_value() {
+        return _full_value_position;
+    };
+
+    // get sparse (index, value) count
+    int get_sparse_count() {
+        return (int)*_sparse_count;
+    };
+
+    // get (index, value) map
+    std::map& get_sparse_map() {
+        return _sparse_map;
+    };
+
+    // parse set , call after copy() or init()
+    void parse();
+
+    // fill registers with set
+    void fill_registers(char* registers, int len);
+
+    // fill map with set
+    void fill_index_to_value_map(std::map* index_to_value, int len);
+
+    // fill hash map
+    void fill_hash64_set(std::set* hash_set);
+
+private :
+    char* _buf_ref;    // set
+    int _buf_len;      // set len
+    HllDataType _set_type;        //set type
+    char* _full_value_position;
+    uint64_t* _expliclit_value;
+    ExpliclitLengthValueType _expliclit_num;
+    std::map _sparse_map;
+    SparseLengthValueType* _sparse_count;
+};
+
+// 通过varcharçš„å˜é•¿ç¼–ç æ–¹å¼å®žçްhll集åˆ
+// 实现hll列中间计算结果的处ç†
+// empty 空集åˆ
+// expliclit 存储64ä½hash值的集åˆ
+// sparse 存储hlléž0çš„register
+// full  存储全部的hll register
+// empty -> expliclit -> sparse -> full å››ç§ç±»åž‹çš„è½¬æ¢æ–¹å‘ä¸å¯é€†
+// 第一个字节存放hll集åˆçš„类型 0:empty 1:expliclit 2:sparse 3:full
+// 已决定åŽé¢çš„æ•°æ®æ€Žä¹ˆè§£æž
+class HllSetHelper {
+public:
+    static void set_sparse(char *result, const std::map& index_to_value, int& len);
+    static void set_expliclit(char* result, const std::set& hash_value_set, int& len);
+    static void set_full(char* result, const char* registers, const int set_len, int& len);
+    static void set_full(char* result, const std::map& index_to_value,
+                         const int set_len, int& len);
+    static void set_max_register(char *registers,
+                                 int registers_len,
+                                 const std::set& hash_set);
+    static void fill_set(const char* data, HllContext* context);
+    static void init_context(HllContext* context);
+};
+
+}  // namespace palo
+
+#endif // BDG_PALO_BE_SRC_OLAP_HLL_H
diff --git a/be/src/olap/i_data.h b/be/src/olap/i_data.h
index 04f844b10c..58d31e4627 100644
--- a/be/src/olap/i_data.h
+++ b/be/src/olap/i_data.h
@@ -19,7 +19,6 @@
 #include 
 #include 
 
-#include "exprs/expr.h"
 #include "gen_cpp/olap_file.pb.h"
 #include "olap/delete_handler.h"
 #include "olap/olap_common.h"
@@ -27,6 +26,8 @@
 #include "olap/olap_index.h"
 #include "util/runtime_profile.h"
 
+#include "olap/column_predicate.h"
+
 namespace palo {
 
 class OLAPTable;
@@ -34,6 +35,7 @@ class OLAPIndex;
 class RowBlock;
 class RowCursor;
 class Conditions;
+class RuntimeState;
 
 // 抽象数æ®è®¿é—®æŽ¥å£
 // æä¾›å¯¹ä¸åŒæ•°æ®æ–‡ä»¶ç±»åž‹çš„统一访问接å£
@@ -68,19 +70,19 @@ public:
     // 下é¢è¿™äº›å‡½æ•°çš„æ³¨é‡Šè§OLAPData的注释
     virtual OLAPStatus init() = 0;
 
-    virtual void set_conjuncts(std::vector* query_conjuncts, 
-                               std::vector* delete_conjuncts) = 0;
+    // Prepre to read data from this data, after seek, block is set to the first block
+    // If start_key is nullptr, we start read from start
+    // If there is no data to read in rang (start_key, end_key), block is set to nullptr
+    // and return OLAP_ERR_DATA_EOF
+    virtual OLAPStatus prepare_block_read(
+        const RowCursor* start_key, bool find_start_key,
+        const RowCursor* end_key, bool find_end_key,
+        RowBlock** block) = 0;
 
-    virtual const RowCursor* get_first_row() = 0;
-    virtual const RowCursor* get_current_row() = 0;
-    virtual const RowCursor* get_next_row() = 0;
-
-    virtual const RowCursor* find_row(
-            const RowCursor& key,
-            bool find_last_key,
-            bool is_end_key) = 0;
-
-    virtual OLAPStatus set_end_key(const RowCursor* end_key, bool find_last_end_key) = 0;
+    // This is called after prepare_block_read, used to get next next row block if exist,
+    // 'block' is set to next block. If there is no more block, 'block' is set to nullptr
+    // with OLAP_ERR_DATA_EOF returned
+    virtual OLAPStatus get_next_block(RowBlock** row_block) = 0;
 
     // 下é¢ä¸¤ä¸ªæŽ¥å£ç”¨äºŽschema_change.cpp, æˆ‘ä»¬éœ€è¦æ”¹åŠŸèƒ½ç»§ç»­åšroll up,
     // 所以继续暴露该接å£
@@ -95,17 +97,24 @@ public:
     //   conditions - 设置查询的过滤æ¡ä»¶
     //   begin_keys - 查询会使用的begin keys
     //   end_keys - 查询会使用的end keys
-    virtual void set_read_params(const std::vector& return_columns,
-                                 const std::set& load_bf_columns,
-                                 const Conditions& conditions,
-                                 const std::vector& start_keys,
-                                 const std::vector& end_keys,
-                                 bool is_using_cache,
-                                 RuntimeState* runtime_state) {
+    virtual void set_read_params(
+            const std::vector& return_columns,
+            const std::set& load_bf_columns,
+            const Conditions& conditions,
+            const std::vector& col_predicates,
+            const std::vector& start_keys,
+            const std::vector& end_keys,
+            bool is_using_cache,
+            RuntimeState* runtime_state) {
         _conditions = &conditions;
+        _col_predicates = &col_predicates;
         _runtime_state = runtime_state;
     }
 
+    void set_stats(OlapReaderStatistics* stats) {
+        _stats = stats;
+    }
+
     virtual void set_delete_handler(const DeleteHandler& delete_handler) {
         _delete_handler = delete_handler;
     }
@@ -114,10 +123,6 @@ public:
         _delete_status = delete_status;
     }
 
-    void set_profile(RuntimeProfile* profile) {
-        _profile = profile;
-    }
-
     // å¼€æ”¾æŽ¥å£æŸ¥è¯¢_eofï¼Œè®©å¤–ç•ŒçŸ¥é“æ•°æ®è¯»å–æ˜¯å¦æ­£å¸¸ç»ˆæ­¢
     // 因为这个函数被频ç¹è®¿é—®, 从性能考虑, æ”¾åœ¨åŸºç±»è€Œä¸æ˜¯è™šå‡½æ•°
     bool eof() {
@@ -167,8 +172,8 @@ protected:
         _olap_index(olap_index),
         _eof(false),
         _conditions(NULL),
+        _col_predicates(NULL),
         _delete_status(DEL_NOT_SATISFIED),
-        _profile(NULL),
         _runtime_state(NULL) {
     }
 
@@ -178,10 +183,12 @@ protected:
     // 当到达文件末尾或者到达end key时设置此标志
     bool _eof;
     const Conditions* _conditions;
+    const std::vector* _col_predicates;
     DeleteHandler _delete_handler;
     DelCondSatisfied _delete_status;
-    RuntimeProfile* _profile;
     RuntimeState* _runtime_state;
+    OlapReaderStatistics _owned_stats;
+    OlapReaderStatistics* _stats = &_owned_stats;
 
 private:
     DISALLOW_COPY_AND_ASSIGN(IData);
diff --git a/be/src/olap/in_list_predicate.cpp b/be/src/olap/in_list_predicate.cpp
new file mode 100644
index 0000000000..06afa0191d
--- /dev/null
+++ b/be/src/olap/in_list_predicate.cpp
@@ -0,0 +1,116 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/in_list_predicate.h"
+#include "olap/field.h"
+#include "runtime/string_value.hpp"
+#include "runtime/vectorized_row_batch.h"
+
+namespace palo {
+
+#define IN_LIST_PRED_CONSTRUCTOR(CLASS) \
+template \
+CLASS::CLASS(int column_id, std::set&& values) \
+    : _column_id(column_id), \
+      _values(std::move(values)) {} \
+
+IN_LIST_PRED_CONSTRUCTOR(InListPredicate)
+IN_LIST_PRED_CONSTRUCTOR(NotInListPredicate)
+
+#define IN_LIST_PRED_EVALUATE(CLASS, OP) \
+template \
+void CLASS::evaluate(VectorizedRowBatch* batch) const { \
+    uint16_t n = batch->size(); \
+    if (n == 0) { \
+        return; \
+    } \
+    uint16_t* sel = batch->selected(); \
+    const type* col_vector = reinterpret_cast(batch->column(_column_id)->col_data()); \
+    uint16_t new_size = 0; \
+    if (batch->column(_column_id)->no_nulls()) { \
+        if (batch->selected_in_use()) { \
+            for (uint16_t j = 0; j != n; ++j) { \
+                uint16_t i = sel[j]; \
+                sel[new_size] = i; \
+                new_size += (_values.find(col_vector[i]) OP _values.end()); \
+            } \
+            batch->set_size(new_size); \
+        } else { \
+            for (uint16_t i = 0; i != n; ++i) { \
+                sel[new_size] = i; \
+                new_size += (_values.find(col_vector[i]) OP _values.end()); \
+            } \
+            if (new_size < n) { \
+                batch->set_size(new_size); \
+                batch->set_selected_in_use(true); \
+            } \
+        } \
+    } else { \
+        bool* is_null = batch->column(_column_id)->is_null(); \
+        if (batch->selected_in_use()) { \
+            for (uint16_t j = 0; j != n; ++j) { \
+                uint16_t i = sel[j]; \
+                sel[new_size] = i; \
+                new_size += (!is_null[i] && _values.find(col_vector[i]) OP _values.end()); \
+            } \
+            batch->set_size(new_size); \
+        } else { \
+            for (int i = 0; i != n; ++i) { \
+                sel[new_size] = i; \
+                new_size += (!is_null[i] && _values.find(col_vector[i]) OP _values.end()); \
+            } \
+            if (new_size < n) { \
+                batch->set_size(new_size); \
+                batch->set_selected_in_use(true); \
+            } \
+        } \
+    } \
+} \
+
+IN_LIST_PRED_EVALUATE(InListPredicate, !=)
+IN_LIST_PRED_EVALUATE(NotInListPredicate, ==)
+
+#define IN_LIST_PRED_CONSTRUCTOR_DECLARATION(CLASS) \
+    template CLASS::CLASS(int column_id, std::set&& values); \
+    template CLASS::CLASS(int column_id, std::set&& values); \
+    template CLASS::CLASS(int column_id, std::set&& values); \
+    template CLASS::CLASS(int column_id, std::set&& values); \
+    template CLASS::CLASS(int column_id, std::set&& values); \
+    template CLASS::CLASS(int column_id, std::set&& values); \
+    template CLASS::CLASS(int column_id, std::set&& values); \
+    template CLASS::CLASS(int column_id, std::set&& values); \
+    template CLASS::CLASS(int column_id, std::set&& values); \
+    template CLASS::CLASS(int column_id, std::set&& values); \
+    template CLASS::CLASS(int column_id, std::set&& values); \
+
+IN_LIST_PRED_CONSTRUCTOR_DECLARATION(InListPredicate)
+IN_LIST_PRED_CONSTRUCTOR_DECLARATION(NotInListPredicate)
+
+#define IN_LIST_PRED_EVALUATE_DECLARATION(CLASS) \
+    template void CLASS::evaluate(VectorizedRowBatch* batch) const; \
+    template void CLASS::evaluate(VectorizedRowBatch* batch) const; \
+    template void CLASS::evaluate(VectorizedRowBatch* batch) const; \
+    template void CLASS::evaluate(VectorizedRowBatch* batch) const; \
+    template void CLASS::evaluate(VectorizedRowBatch* batch) const; \
+    template void CLASS::evaluate(VectorizedRowBatch* batch) const; \
+    template void CLASS::evaluate(VectorizedRowBatch* batch) const; \
+    template void CLASS::evaluate(VectorizedRowBatch* batch) const; \
+    template void CLASS::evaluate(VectorizedRowBatch* batch) const; \
+    template void CLASS::evaluate(VectorizedRowBatch* batch) const; \
+    template void CLASS::evaluate(VectorizedRowBatch* batch) const; \
+
+IN_LIST_PRED_EVALUATE_DECLARATION(InListPredicate)
+IN_LIST_PRED_EVALUATE_DECLARATION(NotInListPredicate)
+} //namespace palo
diff --git a/be/src/olap/in_list_predicate.h b/be/src/olap/in_list_predicate.h
new file mode 100644
index 0000000000..b5b5302f80
--- /dev/null
+++ b/be/src/olap/in_list_predicate.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_SRC_OLAP_IN_LIST_PREDICATE_H
+#define BDG_PALO_BE_SRC_OLAP_IN_LIST_PREDICATE_H
+
+#include 
+#include 
+#include "olap/column_predicate.h"
+
+namespace palo {
+
+class VectorizedRowBatch;
+
+#define IN_LIST_PRED_CLASS_DEFINE(CLASS) \
+template   \
+class CLASS : public ColumnPredicate { \
+public: \
+    CLASS(int column_id, std::set&& values); \
+    virtual ~CLASS() {} \
+    virtual void evaluate(VectorizedRowBatch* batch) const override; \
+private: \
+    int32_t _column_id; \
+    std::set _values; \
+}; \
+
+IN_LIST_PRED_CLASS_DEFINE(InListPredicate)
+IN_LIST_PRED_CLASS_DEFINE(NotInListPredicate)
+
+} //namespace palo
+
+#endif //BDG_PALO_BE_SRC_OLAP_IN_LIST_PREDICATE_H
diff --git a/be/src/olap/lru_cache.cpp b/be/src/olap/lru_cache.cpp
index c0e79b7e4d..3dbf1077b0 100644
--- a/be/src/olap/lru_cache.cpp
+++ b/be/src/olap/lru_cache.cpp
@@ -28,7 +28,6 @@
 #include "olap/olap_index.h"
 #include "olap/row_block.h"
 #include "olap/utils.h"
-#include "util/palo_metrics.h"
 
 using std::string;
 using std::stringstream;
@@ -237,18 +236,9 @@ Cache::Handle* LRUCache::lookup(const CacheKey& key, uint32_t hash) {
     ++_lookup_count;
     LRUHandle* e = _table.lookup(key, hash);
 
-    if (PaloMetrics::olap_lru_cache_lookup_count() != NULL) {
-        PaloMetrics::olap_lru_cache_lookup_count()->increment(1);
-    }
-
     if (e != NULL) {
         ++_hit_count;
         _ref(e);
-
-        if (PaloMetrics::olap_lru_cache_hit_count() != NULL) {
-            PaloMetrics::olap_lru_cache_hit_count()->increment(1);
-        }
-
     }
 
     return reinterpret_cast(e);
diff --git a/be/src/olap/merger.cpp b/be/src/olap/merger.cpp
index 6efd8ba6e3..74e7fca338 100644
--- a/be/src/olap/merger.cpp
+++ b/be/src/olap/merger.cpp
@@ -162,7 +162,7 @@ OLAPStatus Merger::_merge(
     reader_params.reader_type = _reader_type;
     reader_params.olap_data_arr = olap_data_arr;
 
-    if (_reader_type == READER_BASE_EXPANSION) {
+    if (_reader_type == READER_BASE_COMPACTION) {
         reader_params.version = _index->version();
     }
 
@@ -187,7 +187,7 @@ OLAPStatus Merger::_merge(
     }
 
     bool has_error = false;
-    // We calculate selectivities only when base expansioning.
+    // We calculate selectivities only when doing base compaction.
     bool need_calculate_selectivities = (_index->version().first == 0);
     RowCursor row_cursor;
 
@@ -204,7 +204,7 @@ OLAPStatus Merger::_merge(
     }
 
     bool eof = false;
-    int64_t raw_rows_read = 0;
+    MemPool* mem_pool = writer->mem_pool();
 
     // The following procedure would last for long time, half of one day, etc.
     while (!has_error) {
@@ -216,10 +216,10 @@ OLAPStatus Merger::_merge(
             has_error = true;
             break;
         }
+        row_cursor.allocate_memory_for_string_type(_table->tablet_schema(), mem_pool);
 
         // Read one row into row_cursor
-        OLAPStatus res = reader.next_row_with_aggregation(&row_cursor, &raw_rows_read, &eof);
-
+        OLAPStatus res = reader.next_row_with_aggregation(&row_cursor, &eof);
         if (OLAP_SUCCESS == res && eof) {
             OLAP_LOG_DEBUG("reader read to the end.");
             break;
@@ -233,7 +233,7 @@ OLAPStatus Merger::_merge(
         writer->next(row_cursor);
 
         if (need_calculate_selectivities) {
-            // Calculate statistics while base expansion
+            // Calculate statistics while base compaction
             if (0 != _row_count) {
                 size_t first_diff_id = 0;
 
@@ -250,7 +250,7 @@ OLAPStatus Merger::_merge(
             }
 
             // set last row for next comapration.
-            if (OLAP_SUCCESS != last_row.copy(row_cursor)) {
+            if (OLAP_SUCCESS != last_row.copy(row_cursor, mem_pool)) {
                 OLAP_LOG_WARNING("fail to copy last row.");
                 has_error = true;
                 break;
diff --git a/be/src/olap/null_predicate.cpp b/be/src/olap/null_predicate.cpp
new file mode 100644
index 0000000000..a47ae499ec
--- /dev/null
+++ b/be/src/olap/null_predicate.cpp
@@ -0,0 +1,61 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/field.h"
+#include "olap/null_predicate.h"
+#include "runtime/string_value.hpp"
+#include "runtime/vectorized_row_batch.h"
+
+namespace palo {
+
+NullPredicate::NullPredicate(int32_t column_id, bool is_null)
+    : _column_id(column_id), _is_null(is_null) {}
+
+NullPredicate::~NullPredicate() {}
+
+void NullPredicate::evaluate(VectorizedRowBatch* batch) const {
+    uint16_t n = batch->size();
+    if (n == 0) {
+        return;
+    }
+    uint16_t* sel = batch->selected();
+    bool* null_array = batch->column(_column_id)->is_null();
+    uint16_t new_size = 0;
+    if (batch->column(_column_id)->no_nulls() && _is_null) {
+        batch->set_size(new_size);
+        batch->set_selected_in_use(true);
+        return;
+    }
+
+    if (batch->selected_in_use()) {
+        for (uint16_t j = 0; j != n; ++j) {
+            uint16_t i = sel[j];
+            sel[new_size] = i;
+            new_size += (null_array[i] == _is_null); 
+        }
+        batch->set_size(new_size);
+    } else {
+        for (uint16_t i = 0; i != n; ++i) {
+            sel[new_size] = i;
+            new_size += (null_array[i] == _is_null);
+        }
+        if (new_size < n) {
+            batch->set_size(new_size);
+            batch->set_selected_in_use(true);
+        }
+    }
+}
+
+} //namespace palo
diff --git a/be/src/olap/null_predicate.h b/be/src/olap/null_predicate.h
new file mode 100644
index 0000000000..801bc3bc75
--- /dev/null
+++ b/be/src/olap/null_predicate.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_SRC_OLAP_NULL_PREDICATE_H
+#define BDG_PALO_BE_SRC_OLAP_NULL_PREDICATE_H
+
+#include 
+#include "olap/column_predicate.h"
+
+namespace palo {
+
+class VectorizedRowBatch;
+
+class NullPredicate : public ColumnPredicate {
+public:
+    NullPredicate(int32_t column_id, bool is_null);
+    virtual ~NullPredicate();
+
+    virtual void evaluate(VectorizedRowBatch* batch) const override;
+private:
+    int32_t _column_id;
+    bool _is_null; //true for null, false for not null
+};
+
+} //namespace palo
+
+#endif //BDG_PALO_BE_SRC_OLAP_NULL_PREDICATE_H
diff --git a/be/src/olap/olap_common.h b/be/src/olap/olap_common.h
index b061a4cf39..ccf2c006fe 100644
--- a/be/src/olap/olap_common.h
+++ b/be/src/olap/olap_common.h
@@ -95,6 +95,7 @@ enum FieldType {
     OLAP_FIELD_TYPE_DATETIME = 15,      // MySQL_TYPE_DATETIME
     OLAP_FIELD_TYPE_DECIMAL = 16,       // DECIMAL, using different store format against MySQL
     OLAP_FIELD_TYPE_VARCHAR = 17,
+
     OLAP_FIELD_TYPE_STRUCT = 18,        // Struct
     OLAP_FIELD_TYPE_LIST = 19,          // LIST
     OLAP_FIELD_TYPE_MAP = 20,           // Map
@@ -154,8 +155,8 @@ enum PushType {
 enum ReaderType {
     READER_FETCH = 0,
     READER_ALTER_TABLE = 1,
-    READER_BASE_EXPANSION = 2,
-    READER_CUMULATIVE_EXPANSION = 3,
+    READER_BASE_COMPACTION = 2,
+    READER_CUMULATIVE_COMPACTION = 3,
     READER_CHECKSUM = 4,
 };
 
@@ -170,6 +171,7 @@ struct Vertex {
 };
 
 class Field;
+class WrapperField;
 // 包å«Version,对应的version_hashå’Œnum_segments,一般指代OLAP中存在的实体Version
 struct VersionEntity {
     VersionEntity(Version v,
@@ -198,7 +200,7 @@ struct VersionEntity {
                   size_t data_size,
                   size_t index_size,
                   bool empty,
-                  std::vector > &column_statistics) :
+                  const std::vector>& column_statistics) :
             version(v),
             version_hash(hash),
             num_segments(num_seg),
@@ -217,7 +219,32 @@ struct VersionEntity {
     size_t data_size;
     size_t index_size;
     bool empty;
-    std::vector > column_statistics;
+    std::vector> column_statistics;
+};
+
+// ReaderStatistics used to collect statistics when scan data from storage
+struct OlapReaderStatistics {
+    int64_t io_ns = 0;
+    int64_t compressed_bytes_read = 0;
+
+    int64_t decompress_ns = 0;
+    int64_t uncompressed_bytes_read = 0;
+
+    int64_t bytes_read = 0;
+
+    int64_t block_load_ns = 0;
+    int64_t blocks_load = 0;
+    int64_t block_fetch_ns = 0;
+
+    int64_t raw_rows_read = 0;
+
+    int64_t rows_vec_cond_filtered = 0;
+    int64_t vec_cond_ns = 0;
+
+    int64_t rows_stats_filtered = 0;
+    int64_t rows_del_filtered = 0;
+
+    int64_t index_load_ns = 0;
 };
 
 typedef uint32_t ColumnId;
diff --git a/be/src/olap/olap_cond.cpp b/be/src/olap/olap_cond.cpp
index e27ce80ffb..1244b2fd2e 100644
--- a/be/src/olap/olap_cond.cpp
+++ b/be/src/olap/olap_cond.cpp
@@ -22,6 +22,7 @@
 
 #include "olap/olap_define.h"
 #include "olap/utils.h"
+#include "olap/wrapper_field.h"
 
 using std::nothrow;
 using std::pair;
@@ -36,7 +37,7 @@ using palo::column_file::ColumnStatistics;
 //Condcolumn表示一列上所有æ¡ä»¶çš„集åˆã€‚
 //Conds表示一列上的å•个æ¡ä»¶.
 //对于查询æ¡ä»¶è€Œè¨€ï¼Œå„层级的æ¡ä»¶ä¹‹é—´éƒ½æ˜¯é€»è¾‘与的关系
-//对于deleteæ¡ä»¶åˆ™æœ‰ä¸åŒã€‚Condå’ŒCondcolumn之间是逻辑与的关系,而Condtion直接是逻辑或的关系。
+//对于deleteæ¡ä»¶åˆ™æœ‰ä¸åŒã€‚Condå’ŒCondcolumn之间是逻辑与的关系,而Condtion之间是逻辑或的关系。
 
 //具体到实现。
 //eval是用æ¥è¿‡æ»¤æŸ¥è¯¢æ¡ä»¶ï¼ŒåŒ…括堆rowã€blockã€version的过滤,具体使用哪一层看具体的调用地方。
@@ -110,73 +111,115 @@ static CondOp parse_op_type(const string& op) {
     return op_type;
 }
 
-Cond::Cond(const TCondition& condition)
-        : condition_string(apache::thrift::ThriftDebugString(condition)) {
-    OLAP_LOG_DEBUG("parsing expr. [cond_expr=%s]", condition_string.c_str());
-    
-    column_name = condition.column_name;
-    op = parse_op_type(condition.condition_op);
-    operands = condition.condition_values;
-
-    operand_field = NULL;
+Cond::Cond() : op(OP_NULL), operand_field(nullptr) {
 }
 
-bool Cond::validation() {
-    if (op == OP_NULL || (op != OP_IN && operands.size() != 1)) {
-        return false;
+Cond::~Cond() {
+    delete operand_field;
+    for (auto& it : operand_set) {
+        delete it;
     }
-
-    return true;
 }
 
-void Cond::finalize() {
-    if (op == OP_IN) {
-        for (FieldSet::const_iterator it = operand_set.begin(); it != operand_set.end();) {
-            const Field *tmp = *it;
-            operand_set.erase(it++);
-            SAFE_DELETE(tmp);
+OLAPStatus Cond::init(const TCondition& tcond, const FieldInfo& fi) {
+    // Parse op type
+    op = parse_op_type(tcond.condition_op);
+    if (op == OP_NULL || (op != OP_IN && tcond.condition_values.size() != 1)) {
+        OLAP_LOG_WARNING("Condition op type is invalid. [name=%s, op=%d, size=%d]",
+                         tcond.column_name.c_str(), op, tcond.condition_values.size());
+        return OLAP_ERR_INPUT_PARAMETER_ERROR;
+    }
+    if (op == OP_IS) {
+        // 'is null' or 'is not null'
+        auto operand = tcond.condition_values.begin();
+        std::unique_ptr f(WrapperField::create(fi, operand->length()));
+        if (f == nullptr) {
+            OLAP_LOG_WARNING("Create field failed. [name=%s, operand=%s, op_type=%d]",
+                             tcond.column_name.c_str(), operand->c_str(), op);
+            return OLAP_ERR_INPUT_PARAMETER_ERROR;
         }
+        if (strcasecmp(operand->c_str(), "NULL") == 0) {
+            f->set_null();
+        } else {
+            f->set_not_null();
+        }
+        operand_field = f.release();
+    } else if (op != OP_IN) {
+        auto operand = tcond.condition_values.begin();
+        std::unique_ptr f(WrapperField::create(fi, operand->length()));
+        if (f == nullptr) {
+            OLAP_LOG_WARNING("Create field failed. [name=%s, operand=%s, op_type=%d]",
+                             tcond.column_name.c_str(), operand->c_str(), op);
+            return OLAP_ERR_INPUT_PARAMETER_ERROR;
+        }
+        OLAPStatus res = f->from_string(*operand);
+        if (res != OLAP_SUCCESS) {
+            OLAP_LOG_WARNING("Create field failed. [name=%s, operand=%s, op_type=%d]",
+                             tcond.column_name.c_str(), operand->c_str(), op);
+            return res;
+        }
+        operand_field = f.release();
     } else {
-        SAFE_DELETE(operand_field);
+        for (auto& operand : tcond.condition_values) {
+            std::unique_ptr f(WrapperField::create(fi, operand.length()));
+            if (f == NULL) {
+                OLAP_LOG_WARNING("Create field failed. [name=%s, operand=%s, op_type=%d]",
+                                 tcond.column_name.c_str(), operand.c_str(), op);
+                return OLAP_ERR_INPUT_PARAMETER_ERROR;
+            }
+            OLAPStatus res = f->from_string(operand);
+            if (res != OLAP_SUCCESS) {
+                OLAP_LOG_WARNING("Create field failed. [name=%s, operand=%s, op_type=%d]",
+                                 tcond.column_name.c_str(), operand.c_str(), op);
+                return res;
+            }
+            auto insert_reslut = operand_set.insert(f.get());
+            if (!insert_reslut.second) {
+                OLAP_LOG_WARNING("Duplicate operand in in-predicate.[condition=%s]", operand.c_str());
+                // Duplicated, let unique_ptr delete field
+            } else {
+                // Normal case, release this unique_ptr
+                f.release();
+            }
+        }
     }
 
-    for (vector::const_iterator it = operand_field_buf.begin();
-            it != operand_field_buf.end(); ++it) {
-        delete [] *it;
-    }
-    operand_field_buf.clear();
-
-    operands.clear();
+    return OLAP_SUCCESS;
 }
 
-bool Cond::eval(const Field* field) const {
+bool Cond::eval(char* right) const {
     //通过å•列上的å•个查询æ¡ä»¶å¯¹row进行过滤
-    if (field == NULL) {
-        OLAP_LOG_WARNING("null operand for evaluation. [condition=%s]", condition_string.c_str());
+    if (right == NULL) {
         return false;
     }
-    if (field->is_null() && op != OP_IS) {
+    if (*reinterpret_cast(right) && op != OP_IS) {
         //任何operandå’ŒNULLçš„è¿ç®—都是false
         return false;
     }
 
     switch (op) {
     case OP_EQ:
-        return field->cmp(operand_field) == 0;
+        return operand_field->cmp(right) == 0;
     case OP_NE:
-        return field->cmp(operand_field) != 0;
+        return operand_field->cmp(right) != 0;
     case OP_LT:
-        return field->cmp(operand_field) < 0;
+        return operand_field->cmp(right) > 0;
     case OP_LE:
-        return field->cmp(operand_field) <= 0;
+        return operand_field->cmp(right) >= 0;
     case OP_GT:
-        return field->cmp(operand_field) > 0;
+        return operand_field->cmp(right) < 0;
     case OP_GE:
-        return field->cmp(operand_field) >= 0;
-    case OP_IN:
-        return operand_set.find(field) != operand_set.end();
+        return operand_field->cmp(right) <= 0;
+    case OP_IN: {
+        for (const WrapperField* field : operand_set) {
+            if (field->cmp(right) == 0) {
+                return true;
+            }
+        }
+        return false;
+    }
     case OP_IS: {
-        if (operand_field->is_null() == field->is_null()) {
+        if (operand_field->is_null() == *reinterpret_cast(right)) {
             return true;
         } else {
             return false;
@@ -188,200 +231,13 @@ bool Cond::eval(const Field* field) const {
     }
 }
 
-bool Cond::eval(const ColumnStatistics& statistic) const {
-    //通过å•列上的å•个查询æ¡ä»¶å¯¹block进行过滤。
-    if (statistic.ignored()) {
-        return true;
-    }
-
-    if (OP_IS != op && statistic.minimum()->is_null()) {
-        return true;
-    }
-
-    switch (op) {
-    case OP_EQ: {
-        return operand_field->cmp(statistic.minimum()) >= 0
-               && operand_field->cmp(statistic.maximum()) <= 0;
-    }
-    case OP_NE: {
-        return operand_field->cmp(statistic.minimum()) < 0
-               || operand_field->cmp(statistic.maximum()) > 0;
-    }
-    case OP_LT: {
-        return operand_field->cmp(statistic.minimum()) > 0;
-    }
-    case OP_LE: {
-        return operand_field->cmp(statistic.minimum()) >= 0;
-    }
-    case OP_GT: {
-        return operand_field->cmp(statistic.maximum()) < 0;
-    }
-    case OP_GE: {
-        return operand_field->cmp(statistic.maximum()) <= 0;
-    }
-    case OP_IN: {
-        FieldSet::const_iterator it = operand_set.begin();
-        for (; it != operand_set.end(); ++it) {
-            if ((*it)->cmp(statistic.minimum()) >= 0 
-                    && (*it)->cmp(statistic.maximum()) <= 0) {
-                return true;
-            }
-        }
-        break;
-    }
-    case OP_IS: {
-        if (operand_field->is_null()) {
-            if (statistic.minimum()->is_null()) {
-                return true;
-            } else {
-                return false;
-            }
-        } else {
-            if (!statistic.maximum()->is_null()) {
-                return true;
-            } else {
-                return false;
-            }
-        }
-    }
-    default:
-        break;
-    }
-
-    return false;
-}
-
-int Cond::del_eval(const ColumnStatistics& stat) const {
-    //通过å•列上的å•个删除æ¡ä»¶å¯¹block进行过滤。
-    if (stat.ignored()) {
-        //for string type, the column statistics may be not recorded in block level
-        //so it can be ignored for ColumnStatistics.
-        return DEL_PARTIAL_SATISFIED;
-    }
-
-    if (OP_IS != op) {
-        if (stat.minimum()->is_null() && stat.maximum()->is_null()) {
-            return DEL_NOT_SATISFIED;
-        } else if (stat.minimum()->is_null() && !stat.maximum()->is_null()) {
-            return DEL_PARTIAL_SATISFIED;
-        }
-    }
-
-    int ret = DEL_NOT_SATISFIED;
-    switch (op) {
-    case OP_EQ: {
-        if (operand_field->cmp(stat.minimum()) == 0
-            && operand_field->cmp(stat.maximum()) == 0){
-            ret = DEL_SATISFIED;
-        } else if (operand_field->cmp(stat.minimum()) >= 0
-            && operand_field->cmp(stat.maximum()) <= 0) {
-            ret = DEL_PARTIAL_SATISFIED;
-        } else {
-            ret = DEL_NOT_SATISFIED;
-        }
-        return ret;
-    }
-    case OP_NE: {
-        if (operand_field->cmp(stat.minimum()) == 0
-            && operand_field->cmp(stat.maximum()) == 0) {
-            ret = DEL_NOT_SATISFIED;
-        } else if (operand_field->cmp(stat.minimum()) >= 0
-            && operand_field->cmp(stat.maximum()) <= 0) {
-            ret = DEL_PARTIAL_SATISFIED;
-        } else {
-            ret = DEL_SATISFIED;
-        }
-        return ret;
-    }
-    case OP_LT: {
-        if (operand_field->cmp(stat.minimum()) <= 0) {
-            ret = DEL_NOT_SATISFIED;
-        } else if (operand_field->cmp(stat.maximum()) > 0) {
-            ret = DEL_SATISFIED;
-        } else {
-            ret = DEL_PARTIAL_SATISFIED;
-        }
-        return ret;
-    }
-    case OP_LE: {
-        if (operand_field->cmp(stat.minimum()) < 0) {
-            ret = DEL_NOT_SATISFIED;
-        } else if (operand_field->cmp(stat.maximum()) >= 0) {
-            ret = DEL_SATISFIED;
-        } else {
-            ret = DEL_PARTIAL_SATISFIED;
-        }
-        return ret;
-    }
-    case OP_GT: {
-        if (operand_field->cmp(stat.maximum()) >= 0) {
-            ret = DEL_NOT_SATISFIED;
-        } else if (operand_field->cmp(stat.minimum()) < 0) {
-            ret = DEL_SATISFIED;
-        } else {
-            ret = DEL_PARTIAL_SATISFIED;
-        }
-        return ret;
-    }
-    case OP_GE: {
-        if (operand_field->cmp(stat.maximum()) > 0) {
-            ret = DEL_NOT_SATISFIED;
-        } else if (operand_field->cmp(stat.minimum()) <= 0) {
-            ret = DEL_SATISFIED;
-        } else {
-            ret = DEL_PARTIAL_SATISFIED;
-        }
-        return ret;
-    }
-    case OP_IN: {
-        //INå’ŒOR等价,åªè¦æœ‰ä¸€ä¸ªæ“作数满足删除æ¡ä»¶å°±å¯ä»¥å…¨éƒ¨è¿‡æ»¤ï¼›
-        //有一个部分满足删除æ¡ä»¶ï¼Œå°±å¯ä»¥éƒ¨åˆ†è¿‡æ»¤
-        FieldSet::const_iterator it = operand_set.begin();
-        for (; it != operand_set.end(); ++it) {
-            if ((*it)->cmp(stat.minimum()) >= 0
-                && (*it)->cmp(stat.maximum()) <= 0) {
-                if (stat.minimum()->cmp(stat.maximum()) == 0) {
-                    ret = DEL_SATISFIED;
-                } else {
-                    ret = DEL_PARTIAL_SATISFIED;
-                }
-                break;
-            }
-        }
-        if (it == operand_set.end()) {
-            ret = DEL_NOT_SATISFIED;
-        }
-        return ret;
-    }
-    case OP_IS: {
-        if (operand_field->is_null()) {
-            if (stat.minimum()->is_null() && stat.maximum()->is_null()) {
-                ret = DEL_SATISFIED;
-            } else if (stat.minimum()->is_null() && !stat.maximum()->is_null()) {
-                ret = DEL_PARTIAL_SATISFIED;
-            } else {
-                //ä¸ä¼šå‡ºçްminä¸ä¸ºNULL,max为NULL
-                ret = DEL_NOT_SATISFIED;
-            }
-        } else {
-            if (stat.minimum()->is_null() && stat.maximum()->is_null()) {
-                ret = DEL_NOT_SATISFIED;
-            } else if (stat.minimum()->is_null() && !stat.maximum()->is_null()) {
-                ret = DEL_PARTIAL_SATISFIED;
-            } else {
-                ret = DEL_SATISFIED;
-            }
-        }
-        return ret;
-    }
-    default:
-        break;
-    }
-    return ret;
-}
-
-bool Cond::eval(const std::pair& statistic) const {
+bool Cond::eval(const std::pair& statistic) const {
     //通过å•列上的å•个查询æ¡ä»¶å¯¹version进行过滤
+    // When we apply column statistics, the Field can be NULL when the type is Varchar;
+    // in that case we just ignore this cond.
+    if (statistic.first == nullptr || statistic.second == nullptr) {
+        return true;
+    }
     if (OP_IS != op && statistic.first->is_null()) {
         return true;
     }
@@ -438,8 +294,16 @@ bool Cond::eval(const std::pair& statistic) const {
     return false;
 }
 
-int Cond::del_eval(const std::pair& stat) const {
+int Cond::del_eval(const std::pair& stat) const {
     //通过å•列上的å•个删除æ¡ä»¶å¯¹version进行过滤。
+    
+    // When we apply column statistics, stat may be null.
+    if (stat.first == nullptr || stat.second == nullptr) {
+        //for string type, the column statistics may be not recorded in block level
+        //so it can be ignored for ColumnStatistics.
+        return DEL_PARTIAL_SATISFIED;
+    }
+
     if (OP_IS != op) {
         if (stat.first->is_null() && stat.second->is_null()) {
             return DEL_NOT_SATISFIED;
@@ -563,17 +427,35 @@ bool Cond::eval(const column_file::BloomFilter& bf) const {
     //通过å•列上BloomFilter对block进行过滤。
     switch (op) {
     case OP_EQ: {
-        return bf.test_bytes(operand_field->buf(), operand_field->size());
+        bool existed = false;
+        if (operand_field->is_string_type()) {
+            StringSlice* slice = (StringSlice*)(operand_field->ptr());
+            existed = bf.test_bytes(slice->data, slice->size);
+        } else {
+            existed = bf.test_bytes(operand_field->ptr(), operand_field->size());
+        }
+        return existed;
     }
     case OP_IN: {
         FieldSet::const_iterator it = operand_set.begin();
         for (; it != operand_set.end(); ++it) {
-            if (bf.test_bytes((*it)->buf(), (*it)->size())) {
-                return true;
+            bool existed = false;
+            if ((*it)->is_string_type()) {
+                StringSlice* slice = (StringSlice*)((*it)->ptr());
+                existed = bf.test_bytes(slice->data, slice->size);
+            } else {
+                existed = bf.test_bytes((*it)->ptr(), (*it)->size());
             }
+            if (existed) { return true; }
         }
         return false;
     }
+    case OP_IS: {
+        // IS [NOT] NULL can only be used to filter the IS NULL predicate.
+        if (operand_field->is_null()) {
+            return bf.test_bytes(nullptr, 0);
+        }
+    }
     default:
         break;
     }
@@ -581,130 +463,30 @@ bool Cond::eval(const column_file::BloomFilter& bf) const {
     return false;
 }
 
-Field* Cond::create_field(const FieldInfo& fi) {
-    return create_field(fi, 0);
-}
-
-Field* Cond::create_field(const FieldInfo& fi, uint32_t len) {
-    Field* f = Field::create(fi);
-    if (f == NULL) {
-        OLAP_LOG_WARNING("fail to create Field Object. [type=%d]", fi.type);
-        return NULL;
+CondColumn::~CondColumn() {
+    for (auto& it : _conds) {
+        delete it;
     }
-
-    uint32_t buf_len = 0;
-    switch (fi.type) {
-        case OLAP_FIELD_TYPE_VARCHAR:
-            buf_len = std::max((uint32_t)(len + sizeof(VarCharField::LengthValueType)),
-                                  fi.length);
-            f->set_buf_size(buf_len);
-            f->set_string_length(buf_len);
-            break;
-        case OLAP_FIELD_TYPE_CHAR:
-            buf_len = std::max(len,
-                               fi.length);
-            f->set_buf_size(buf_len);
-            f->set_string_length(buf_len);
-            break;
-        default:
-            buf_len = fi.length;
-    } 
-
-    char* buf = new(nothrow) char[buf_len + sizeof(char)];
-    memset(buf, 0, buf_len + sizeof(char));
-    if (buf == NULL) {
-        OLAP_LOG_WARNING("fail to alloc memory for field::attach. [length=%u]", buf_len);
-        return NULL;
-    }
-
-    operand_field_buf.push_back(buf);
-    f->attach_field(buf);
-
-    return f;
-}
-
-CondColumn::CondColumn(const CondColumn& from) {
-    for (vector::const_iterator it = from._conds.begin(); it != from._conds.end(); ++it) {
-        _conds.push_back(*it);
-    }
-
-    _table = from._table;
-    _is_key = from._is_key;
-    _col_index = from._col_index;
 }
 
 // PRECONDITION 1. index is valid; 2. at least has one operand
-bool CondColumn::add_condition(Cond* condition) {
-    if (condition->op == OP_IS) {
-        OLAP_LOG_DEBUG("Use cond.operand_field to initialize Field Object."
-                       "[for_column=%d; operand=%s; op_type=%d]",
-                       _col_index, condition->operands.begin()->c_str(), condition->op);
-        Field* f = condition->create_field(_table->tablet_schema()[_col_index],
-                                            condition->operands.begin()->length());
-        if (f == NULL) {
-            return false;
-        }
-
-        if (0 == strcasecmp(condition->operands.begin()->c_str(), "NULL")) {
-            f->set_null();
-        } else {
-            f->set_not_null();
-        }
-        condition->operand_field = f;
-    } else if (condition->op != OP_IN) {
-        OLAP_LOG_DEBUG("Use cond.operand_field to initialize Field Object."
-                       "[for_column=%d; operand=%s; op_type=%d]",
-                       _col_index, condition->operands.begin()->c_str(), condition->op);
-
-        Field* f = condition->create_field(_table->tablet_schema()[_col_index],
-                                           condition->operands.begin()->length());
-        if (f == NULL) {
-            return false;
-        }
-
-        if (OLAP_SUCCESS != f->from_string(*(condition->operands.begin()))) {
-            return false;
-        }
-
-        condition->operand_field = f;
-    } else {
-        for (vector::iterator it = condition->operands.begin();
-                it != condition->operands.end(); ++it) {
-            OLAP_LOG_DEBUG("Use cond.operand_set to initialize Field Objects."
-                           "[for_column=%d; operands=%s]", _col_index, it->c_str());
-
-            Field* f = condition->create_field(_table->tablet_schema()[_col_index],
-                                               it->length());
-              
-            if (f == NULL) {
-                return false;
-            }
-
-            if (OLAP_SUCCESS != f->from_string(*it)) {
-                return false;
-            }
-
-            pair insert_reslut = 
-                    condition->operand_set.insert(f);
-            if (!insert_reslut.second) {
-                OLAP_LOG_WARNING("fail to insert operand set.[condition=%s]", it->c_str());
-                SAFE_DELETE(f);
-            }
-        }
+OLAPStatus CondColumn::add_cond(const TCondition& tcond, const FieldInfo& fi) {
+    std::unique_ptr cond(new Cond());
+    auto res = cond->init(tcond, fi);
+    if (res != OLAP_SUCCESS) {
+        return res;
     }
-
-    _conds.push_back(*condition);
-
-    return true;
+    _conds.push_back(cond.release());
+    return OLAP_SUCCESS;
 }
 
 bool CondColumn::eval(const RowCursor& row) const {
     //通过一列上的所有查询æ¡ä»¶å¯¹å•行数æ®è¿›è¡Œè¿‡æ»¤
-    const Field* field = row.get_field_by_index(_col_index);
-    vector::const_iterator each_cond = _conds.begin();
-    for (; each_cond != _conds.end(); ++each_cond) {
+    Field* field = const_cast(row.get_field_by_index(_col_index));
+    char* buf = field->get_field_ptr(row.get_buf());
+    for (auto& each_cond : _conds) {
         // As long as there is one condition not satisfied, we can return false
-        if (!each_cond->eval(field)) {
+        if (!each_cond->eval(buf)) {
             return false;
         }
     }
@@ -712,51 +494,9 @@ bool CondColumn::eval(const RowCursor& row) const {
     return true;
 }
 
-bool CondColumn::eval(const ColumnStatistics& statistic) const {
-    //通过一列上的所有查询æ¡ä»¶å¯¹block进行过滤
-    vector::const_iterator each_cond = _conds.begin();
-    for (; each_cond != _conds.end(); ++each_cond) {
-        if (!each_cond->eval(statistic)) {
-            return false;
-        }
-    }
-
-    return true;
-}
-
-int CondColumn::del_eval(const ColumnStatistics& col_stat) const {
-    //通过一列上的所有删除æ¡ä»¶å¯¹block进行过滤
-    int ret = DEL_NOT_SATISFIED;
-    bool del_partial_stastified = false;
-    bool del_not_stastified = false;
-
-    vector::const_iterator each_cond = _conds.begin();
-    for (; each_cond != _conds.end(); ++each_cond) {
-        int del_ret = each_cond->del_eval(col_stat);
-        if (DEL_SATISFIED == del_ret) {
-            continue;
-        } else if (DEL_PARTIAL_SATISFIED == del_ret) {
-            del_partial_stastified = true;
-        } else {
-            del_not_stastified = true;
-            break;
-        }
-    }
-    if (true == del_not_stastified || 0 == _conds.size()) {
-        ret = DEL_NOT_SATISFIED;
-    } else if (true == del_partial_stastified) {
-        ret = DEL_PARTIAL_SATISFIED;
-    } else {
-        ret = DEL_SATISFIED;
-    }
-    return ret;
-
-}
-
-bool CondColumn::eval(const std::pair &statistic) const {
+bool CondColumn::eval(const std::pair &statistic) const {
     //通过一列上的所有查询æ¡ä»¶å¯¹version进行过滤
-    vector::const_iterator each_cond = _conds.begin();
-    for (; each_cond != _conds.end(); ++each_cond) {
+    for (auto& each_cond : _conds) {
         if (!each_cond->eval(statistic)) {
             return false;
         }
@@ -765,7 +505,7 @@ bool CondColumn::eval(const std::pair &statistic) const {
     return true;
 }
 
-int CondColumn::del_eval(const std::pair& statistic) const {
+int CondColumn::del_eval(const std::pair& statistic) const {
     //通过一列上的所有删除æ¡ä»¶å¯¹version进行过滤
 
     /*
@@ -777,8 +517,7 @@ int CondColumn::del_eval(const std::pair& statistic) const {
     int ret = DEL_NOT_SATISFIED;
     bool del_partial_statified = false;
     bool del_not_statified = false; 
-    vector::const_iterator each_cond = _conds.begin();
-    for (; each_cond != _conds.end(); ++each_cond) {
+    for (auto& each_cond : _conds) {
         int del_ret = each_cond->del_eval(statistic);
         if (DEL_SATISFIED == del_ret) {
             continue;
@@ -804,8 +543,7 @@ int CondColumn::del_eval(const std::pair& statistic) const {
 
 bool CondColumn::eval(const column_file::BloomFilter& bf) const {
     //通过一列上的所有BloomFilter索引信æ¯å¯¹block进行过滤
-    vector::const_iterator each_cond = _conds.begin();
-    for (; each_cond != _conds.end(); ++each_cond) {
+    for (auto& each_cond : _conds) {
         if (!each_cond->eval(bf)) {
             return false;
         }
@@ -814,57 +552,31 @@ bool CondColumn::eval(const column_file::BloomFilter& bf) const {
     return true;
 }
 
-void CondColumn::finalize() {
-    for (vector::iterator it = _conds.begin(); it != _conds.end(); ++it) {
-        it->finalize();
-    }
-}
-
-OLAPStatus Conditions::append_condition(const TCondition& condition) {
-    if (_table == NULL) {
-        OLAP_LOG_WARNING("fail to parse condition without any table attached. [condition=%s]",
-                         apache::thrift::ThriftDebugString(condition).c_str());
-        return OLAP_ERR_NOT_INITED;
-    }
-
-    // Parse triplet for condition
-    Cond cond(condition);
-    if (!cond.validation()) {
-        OLAP_LOG_WARNING("fail to parse condition, invalid condition format. [condition=%s]",
-                         apache::thrift::ThriftDebugString(condition).c_str());
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-
-    int32_t index = _table->get_field_index(cond.column_name);
+OLAPStatus Conditions::append_condition(const TCondition& tcond) {
+    int32_t index = _table->get_field_index(tcond.column_name);
     if (index < 0) {
         OLAP_LOG_WARNING("fail to get field index, name is invalid. [index=%d; field_name=%s]",
                          index,
-                         cond.column_name.c_str());
+                         tcond.column_name.c_str());
         return OLAP_ERR_INPUT_PARAMETER_ERROR;
     }
 
     // Skip column which is non-key, or whose type is string or float
-    FieldInfo fi = _table->tablet_schema()[index];
+    const FieldInfo& fi = _table->tablet_schema()[index];
     if (fi.type == OLAP_FIELD_TYPE_DOUBLE || fi.type == OLAP_FIELD_TYPE_FLOAT) {
         return OLAP_SUCCESS;
     }
 
-    if (_columns.count(index) != 0) {
-        if (!_columns[index].add_condition(&cond)) {
-            OLAP_LOG_WARNING("fail to add condition for column. [field_index=%d]", index);
-            return OLAP_ERR_INPUT_PARAMETER_ERROR;
-        }
+    CondColumn* cond_col = nullptr;
+    auto it = _columns.find(index);
+    if (it == _columns.end()) {
+        cond_col = new CondColumn(_table, index);
+        _columns[index] = cond_col;
     } else {
-        CondColumn cc(_table, index);
-        if (!cc.add_condition(&cond)) {
-            OLAP_LOG_WARNING("fail to add condition for column. [field_index=%d]", index);
-            return OLAP_ERR_INPUT_PARAMETER_ERROR;
-        }
-
-        _columns[index] = cc;
+        cond_col = it->second;
     }
 
-    return OLAP_SUCCESS;
+    return cond_col->add_cond(tcond, fi);
 }
 
 bool Conditions::delete_conditions_eval(const RowCursor& row) const {
@@ -873,9 +585,8 @@ bool Conditions::delete_conditions_eval(const RowCursor& row) const {
         return false;
     }
     
-    for (CondColumns::const_iterator each_cond = _columns.begin();
-            each_cond != _columns.end(); ++each_cond) {
-        if (each_cond->second.is_key() && !each_cond->second.eval(row)) {
+    for (auto& each_cond : _columns) {
+        if (each_cond.second->is_key() && !each_cond.second->eval(row)) {
             return false;
         }
     }
@@ -888,18 +599,17 @@ bool Conditions::delete_conditions_eval(const RowCursor& row) const {
 }
 
 bool Conditions::delta_pruning_filter(
-        std::vector> &column_statistics) const {
+        const std::vector>& column_statistics) const {
     //通过所有列上的删除æ¡ä»¶å¯¹version进行过滤
-    for (CondColumns::const_iterator cond_it = _columns.begin(); 
-            cond_it != _columns.end(); ++cond_it) {
-        if (cond_it->second.is_key() && cond_it->first > column_statistics.size()) {
+    for (auto& cond_it : _columns) {
+        if (cond_it.second->is_key() && cond_it.first > column_statistics.size()) {
             OLAP_LOG_WARNING("where condition not equal column statistics size."
                     "[cond_id=%d, column_statistics_size=%lu]", 
-                    cond_it->first,
+                    cond_it.first,
                     column_statistics.size());
             return false;
         }
-        if (cond_it->second.is_key() && !cond_it->second.eval(column_statistics[cond_it->first])) {
+        if (cond_it.second->is_key() && !cond_it.second->eval(column_statistics[cond_it.first])) {
             return true;
         }
     }
@@ -907,7 +617,7 @@ bool Conditions::delta_pruning_filter(
 }
 
 int Conditions::delete_pruning_filter(
-        std::vector> & col_stat) const {
+        const std::vector>& col_stat) const {
 
     //通过所有列上的删除æ¡ä»¶å¯¹version进行过滤
     /*
@@ -919,23 +629,21 @@ int Conditions::delete_pruning_filter(
     int ret = DEL_NOT_SATISFIED;
     bool del_partial_satisfied = false;
     bool del_not_satisfied = false;
-    CondColumns::const_iterator cond_it = _columns.begin();
-    for (; cond_it != _columns.end(); ++cond_it) {
+    for (auto& cond_it : _columns) {
         /*
          * this is base on the assumption that the delete condition
          * is only about key field, not about value field.
         */
-        if (cond_it->second.is_key() && cond_it->first > col_stat.size()) {
+        if (cond_it.second->is_key() && cond_it.first > col_stat.size()) {
             OLAP_LOG_WARNING("where condition not equal column statistics size."
                     "[cond_id=%d, column_statistics_size=%lu]", 
-                    cond_it->first,
+                    cond_it.first,
                     col_stat.size());
             del_partial_satisfied = true;
             continue;
         }
 
-        std::pair stat = col_stat[cond_it->first]; 
-        int del_ret = cond_it->second.del_eval(stat);
+        int del_ret = cond_it.second->del_eval(col_stat[cond_it.first]);
         if (DEL_SATISFIED == del_ret) {
             continue;
         } else if (DEL_PARTIAL_SATISFIED == del_ret) {
diff --git a/be/src/olap/olap_cond.h b/be/src/olap/olap_cond.h
index 74d8afea95..8ecc6fb05c 100644
--- a/be/src/olap/olap_cond.h
+++ b/be/src/olap/olap_cond.h
@@ -30,6 +30,9 @@
 #include "olap/row_cursor.h"
 
 namespace palo {
+
+class WrapperField;
+
 enum CondOp {
     OP_EQ = 0,      // equal
     OP_NE = 1,      // not equal
@@ -44,14 +47,14 @@ enum CondOp {
 
 // Hash functor for IN set
 struct FieldHash {
-    size_t operator()(const Field* field) const {
-        return std::hash()(std::string(field->buf(), field->size()));
+    size_t operator()(const WrapperField* field) const {
+        return field->hash_code();
     }
 };
 
 // Equal function for IN set
 struct FieldEqual {
-    bool operator()(const Field* left, const Field* right) const {
+    bool operator()(const WrapperField* left, const WrapperField* right) const {
         return left->cmp(right) == 0;
     }
 };
@@ -59,86 +62,62 @@ struct FieldEqual {
 // æ¡ä»¶äºŒå…ƒç»„,æè¿°äº†ä¸€ä¸ªæ¡ä»¶çš„æ“ä½œç±»åž‹å’Œæ“作数(1个或者多个)
 struct Cond {
 public:
-    typedef std::unordered_set FieldSet;
+    Cond();
+    ~Cond();
 
-    Cond(const TCondition& condition);
+    OLAPStatus init(const TCondition& tcond, const FieldInfo& fi);
     
-    // Check whehter this condition is valid
-    // Valid condition:
-    // 1) 'op' is not null
-    // 2) if 'op' is not IN, it should have only one operand
-    bool validation();
-    
-    void finalize();
     // 用一行数æ®çš„æŒ‡å®šåˆ—åŒæ¡ä»¶è¿›è¡Œæ¯”较,如果符åˆè¿‡æ»¤æ¡ä»¶ï¼Œ
     // 峿Œ‰ç…§æ­¤æ¡ä»¶ï¼Œè¡Œåº”被过滤掉,则返回true,å¦åˆ™è¿”回false
-    bool eval(const Field* field) const;
+    bool eval(char* right) const;
     
-    bool eval(const column_file::ColumnStatistics& statistic) const;
-    int del_eval(const column_file::ColumnStatistics& stat) const;
-
-    bool eval(const std::pair& statistic) const;
-    int del_eval(const std::pair& stat) const;
+    bool eval(const std::pair& statistic) const;
+    int del_eval(const std::pair& stat) const;
 
     bool eval(const column_file::BloomFilter& bf) const;
     
-    // å°è£…Field::create以åŠåˆ†é…attach使用的buffer
-    Field* create_field(const FieldInfo& fi);
-
-    Field* create_field(const FieldInfo& fi, uint32_t len);
-
-    CondOp                      op;
-    std::string                 column_name;
-    std::string                 condition_string;
-    std::vector    operands;         // 所有æ“作数的字符表示
-    Field*                      operand_field;    // å¦‚æžœä¸æ˜¯OP_IN, 此处ä¿å­˜å”¯ä¸€æ“作数
-    FieldSet                    operand_set;      // 如果是OP_IN,此处为IN的集åˆ
-
-private:
-    std::vector         operand_field_buf;  // buff for field.attach
+    CondOp op;
+    // valid when op is not OP_IN
+    WrapperField* operand_field;
+    // valid when op is OP_IN
+    typedef std::unordered_set FieldSet;
+    FieldSet operand_set;
 };
 
 // 所有归属于åŒä¸€åˆ—上的æ¡ä»¶äºŒå…ƒç»„,èšåˆåœ¨ä¸€ä¸ªCondColumn上
 class CondColumn {
 public:
-    CondColumn() : _is_key(true), _col_index(0) {}
-    
     CondColumn(SmartOLAPTable table, int32_t index) : _col_index(index), _table(table) {
         _conds.clear();
         _is_key = _table->tablet_schema()[_col_index].is_key;
     }
-
-    CondColumn(const CondColumn& from);
+    ~CondColumn();
 
     // Convert condition's operand from string to Field*, and append this condition to _conds
     // return true if success, otherwise return false
     bool add_condition(Cond* condition);
+    OLAPStatus add_cond(const TCondition& tcond, const FieldInfo& fi);
 
     // 对一行数æ®ä¸­çš„æŒ‡å®šåˆ—,用所有过滤æ¡ä»¶è¿›è¡Œæ¯”较,如果所有æ¡ä»¶éƒ½æ»¡è¶³ï¼Œåˆ™è¿‡æ»¤æ­¤è¡Œ
     bool eval(const RowCursor& row) const;
-    
-    bool eval(const column_file::ColumnStatistics& statistic) const;
-    int del_eval(const column_file::ColumnStatistics& col_stat) const;
 
-    bool eval(const std::pair& statistic) const;
-    int del_eval(const std::pair& statistic) const;
+    bool eval(const std::pair& statistic) const;
+    int del_eval(const std::pair& statistic) const;
 
     bool eval(const column_file::BloomFilter& bf) const;
 
-    void finalize();
-
     inline bool is_key() const {
         return _is_key;
     }
 
-    const std::vector& conds() const {
+    const std::vector& conds() const {
         return _conds;
     }
 
 private:
     bool                _is_key;
     int32_t             _col_index;
-    std::vector   _conds;
+    std::vector   _conds;
     SmartOLAPTable      _table;
 };
 
@@ -147,22 +126,13 @@ class Conditions {
 public:
     // Key: field index of condition's column
     // Value: CondColumn object
-    typedef std::map CondColumns;
+    typedef std::map CondColumns;
 
     Conditions() {}
 
-    Conditions& operator=(const Conditions& conds) {
-        if (&conds != this) {
-            _columns = conds._columns;
-            _table = conds._table;
-        }
-
-        return *this;
-    }
-
     void finalize() {
-        for (CondColumns::iterator it = _columns.begin(); it != _columns.end(); ++it) {
-            it->second.finalize();
+        for (auto& it : _columns) {
+            delete it.second;
         }
         _columns.clear();
     }
@@ -181,15 +151,11 @@ public:
     OLAPStatus append_condition(const TCondition& condition);
     
     bool delete_conditions_eval(const RowCursor& row) const;
-
-    int delete_conditions_eval(const column_file::ColumnStatistics& col_stat) const;
     
-    bool where_conditions_eval(uint32_t field_index,
-                               const column_file::ColumnStatistics& statistic) const;
-
-    bool delta_pruning_filter(std::vector> &column_statistics) const;
-    int delete_pruning_filter(std::vector> &column_statistics) const;
-
+    bool delta_pruning_filter(
+        const std::vector>& column_statistics) const;
+    int delete_pruning_filter(
+        const std::vector>& column_statistics) const;
 
     const CondColumns& columns() const {
         return _columns;
diff --git a/be/src/olap/olap_data.cpp b/be/src/olap/olap_data.cpp
index e0b770a03d..56d23a7a02 100644
--- a/be/src/olap/olap_data.cpp
+++ b/be/src/olap/olap_data.cpp
@@ -74,11 +74,6 @@ OLAPStatus OLAPData::init() {
     return unpickle();
 }
 
-void OLAPData::set_conjuncts(std::vector* query_conjuncts, 
-                             std::vector* delete_conjuncts) {
-    _row_block_broker->set_conjuncts(query_conjuncts, delete_conjuncts);
-}
-
 OLAPStatus OLAPData::get_first_row_block(RowBlock** row_block,
                                      const char** packed_row_block,
                                      uint32_t* packed_row_block_size) {
@@ -105,7 +100,7 @@ OLAPStatus OLAPData::get_first_row_block(RowBlock** row_block,
         return res;
     }
 
-    res = _row_block_broker->change_to(row_block_pos, _profile);
+    res = _row_block_broker->change_to(row_block_pos);
     if (res != OLAP_SUCCESS) {
         OLAP_LOG_WARNING("Fail to get row block. "
                          "[segment=%d, block_size=%d, data_offset=%d, index_offset=%d]",
@@ -116,6 +111,7 @@ OLAPStatus OLAPData::get_first_row_block(RowBlock** row_block,
         _check_io_error(res);
         return res;
     }
+    _stats->raw_rows_read += _row_block_broker->num_rows();
 
     (row_block == NULL || (*row_block = _row_block_broker->row_block()));
     (packed_row_block == NULL || (*packed_row_block = _row_block_broker->packed_row_block()));
@@ -176,7 +172,7 @@ OLAPStatus OLAPData::get_next_row_block(RowBlock** row_block,
 
         return OLAP_ERR_INDEX_EOF;
     }
-    res = _row_block_broker->change_to(row_block_pos, _profile);
+    res = _row_block_broker->change_to(row_block_pos);
     if (res != OLAP_SUCCESS) {
         OLAP_LOG_WARNING("Fail to get row block. "
                          "[segment=%d, block_size=%d, data_offset=%d, index_offset=%d]",
@@ -188,6 +184,7 @@ OLAPStatus OLAPData::get_next_row_block(RowBlock** row_block,
         _check_io_error(res);
         return res;
     }
+    _stats->raw_rows_read += _row_block_broker->num_rows();
 
     (row_block == NULL || (*row_block = _row_block_broker->row_block()));
     (packed_row_block == NULL || (*packed_row_block = _row_block_broker->packed_row_block()));
@@ -214,7 +211,7 @@ RowBlock* OLAPData::seek_and_get_row_block(const RowBlockPosition& position) {
     }
 
     OLAPStatus res = OLAP_SUCCESS;
-    if ((res = _row_block_broker->change_to(position, _profile)) != OLAP_SUCCESS) {
+    if ((res = _row_block_broker->change_to(position)) != OLAP_SUCCESS) {
         OLAP_LOG_WARNING("Fail to get row block. "
                          "[segment=%d, block_size=%d, data_offset=%d, index_offset=%d]",
                          position.segment,
@@ -225,6 +222,7 @@ RowBlock* OLAPData::seek_and_get_row_block(const RowBlockPosition& position) {
         _check_io_error(res);
         return NULL;
     }
+    _stats->raw_rows_read += _row_block_broker->num_rows();
 
     return _row_block_broker->row_block();
 }
@@ -249,7 +247,7 @@ const RowCursor* OLAPData::get_first_row() {
 
     OLAP_LOG_DEBUG("RowBlockPosition='%s'", row_block_pos.to_string().c_str());
 
-    if ((res = _row_block_broker->change_to(row_block_pos, _profile)) != OLAP_SUCCESS) {
+    if ((res = _row_block_broker->change_to(row_block_pos)) != OLAP_SUCCESS) {
         OLAP_LOG_WARNING("Fail to get row block. "
                          "[segment=%d, block_size=%d, data_offset=%d, index_offset=%d]",
                          row_block_pos.segment,
@@ -260,22 +258,11 @@ const RowCursor* OLAPData::get_first_row() {
         _check_io_error(res);
         return NULL;
     }
+    _stats->raw_rows_read += _row_block_broker->num_rows();
 
     return _row_block_broker->first();
 }
 
-const RowCursor* OLAPData::get_current_row() {
-    set_eof(false);
-
-    if (!_row_block_broker) {
-        OLAP_LOG_FATAL("using pickled OLAPData is forbidden.");
-        return NULL;
-    }
-
-    // 这里没有强é™åˆ¶,必须在调用其他的获å–get rowçš„æ–¹æ³•ä¹‹åŽæ‰èƒ½ä½¿ç”¨æ­¤æ–¹æ³•
-    return _row_block_broker->current();
-}
-
 const RowCursor* OLAPData::get_next_row() {
     set_eof(false);
 
@@ -408,7 +395,7 @@ const RowCursor* OLAPData::find_row(const RowCursor& key, bool find_last_key, bo
 
     while (end_position >= start_position && !data_eof) {
         // æ ¹æ®poså–到对应的row_block
-        if ((res = _row_block_broker->change_to(start_position, _profile)) != OLAP_SUCCESS) {
+        if ((res = _row_block_broker->change_to(start_position)) != OLAP_SUCCESS) {
             OLAP_LOG_WARNING("Fail to get row block. "
                              "[segment=%d, block_size=%d, data_offset=%d, index_offset=%d]",
                              start_position.segment,
@@ -419,6 +406,7 @@ const RowCursor* OLAPData::find_row(const RowCursor& key, bool find_last_key, bo
             _check_io_error(res);
             return NULL;
         }
+        _stats->raw_rows_read += _row_block_broker->num_rows();
 
         // eofä»£è¡¨è¿™ä¸€å—æ‰¾å®Œäº†ï¼Œä»ç„¶æ²¡æœ‰å‘现key,但也å¯èƒ½æ˜¯æ‰¾åˆ°äº†endkey,也就是说
         // 这个数æ®ä¸­æ²¡æœ‰éœ€è¦çš„key。
@@ -508,7 +496,7 @@ OLAPStatus OLAPData::unpickle() {
                                        _session_status->end_row_index);
         _row_block_broker->set_end_row_flag(_session_status->is_set_end_row);
 
-        res = _row_block_broker->change_to(_session_status->position, _profile);
+        res = _row_block_broker->change_to(_session_status->position);
         if (res != OLAP_SUCCESS) {
             OLAP_LOG_WARNING("fail to get row block. "
                              "[res=%d segment=%d block_size=%d data_offset=%d index_offset=%d]",
@@ -522,6 +510,7 @@ OLAPStatus OLAPData::unpickle() {
             _check_io_error(res);
             return OLAP_ERR_DATA_ROW_BLOCK_ERROR;
         }
+        _stats->raw_rows_read += _row_block_broker->num_rows();
 
         _row_block_broker->get_row(_session_status->row_index);
     }
@@ -606,115 +595,45 @@ ADD_SEGMENT_ERR:
     return res;
 }
 
-OLAPStatus OLAPData::add_packed_row_block(const RowBlock* row_block,
-                                      const char* packed_row_block,
-                                      uint32_t packed_row_block_size,
-                                      uint32_t* start_data_offset,
-                                      uint32_t* end_data_offset) {
+OLAPStatus OLAPData::add_row_block(RowBlock* row_block,
+                                   uint32_t* start_data_offset,
+                                   uint32_t* end_data_offset) {
     if (!_write_descriptor) {
         OLAP_LOG_WARNING("segment should be added before.");
         return OLAP_ERR_NOT_INITED;
     }
 
     OLAPStatus res = OLAP_SUCCESS;
-    RowBlockHeaderV2 row_block_header;
 
-    memory_copy(_write_descriptor->packed_buffer, packed_row_block, packed_row_block_size);
-
-    RowBlockInfo rb_info = row_block->row_block_info();
     // 返回RowBlockèµ·å§‹ä½ç½®çš„Offset
     off_t offset = _write_descriptor->file_handle.tell();
     if (offset == -1) {
-        OLAP_LOG_WARNING("fail to tell file. [err=%m]");
         res = OLAP_ERR_IO_ERROR;
-        goto ADD_PACKED_ROW_BLOCK_ERROR;
+        _check_io_error(res);
+        return res;
     }
 
     (start_data_offset == NULL || (*start_data_offset = static_cast(offset)));
 
-    // æ›´æ–°RowBlockHeader
-    row_block_header.packed_len = static_cast(packed_row_block_size);
-    row_block_header.num_rows = rb_info.row_num;
-    row_block_header.checksum = rb_info.checksum;
-    //新增内容,包括魔数,版本和未打包内容的大å°ï¼Œä¾¿äºŽåŽç»­è§£åŽ‹
-    row_block_header.magic_num = 0;
-    row_block_header.version = 1;
-    row_block_header.unpacked_len = row_block->used_buf_len();
-    
-    res = _write_descriptor->file_handle.write(&row_block_header, sizeof(row_block_header));
-    if (res != OLAP_SUCCESS) {
-        OLAP_LOG_WARNING("fail to dump row block header. [size=%lu]", sizeof(row_block_header));
-        goto ADD_PACKED_ROW_BLOCK_ERROR;
-    }
-    
-    res = _write_descriptor->file_handle.write(_write_descriptor->packed_buffer,
-                                               packed_row_block_size);
-    if (res != OLAP_SUCCESS) {
-        OLAP_LOG_WARNING("fail to dump row block data. [size=%u]", packed_row_block_size);
-        goto ADD_PACKED_ROW_BLOCK_ERROR;
-    }
-
-    // æ›´æ–°SegmentHeader的校验ç ï¼Œè®¡ç®—RowBlockæ•°æ®éƒ¨åˆ†
-    _write_descriptor->checksum = olap_crc32(_write_descriptor->checksum,
-                                             _write_descriptor->packed_buffer,
-                                             packed_row_block_size);
-    // 返回RowBlock结æŸä½ç½®çš„Offset
-    offset = _write_descriptor->file_handle.tell();
-    if (offset == -1) {
-        res = OLAP_ERR_IO_ERROR;
-        goto ADD_PACKED_ROW_BLOCK_ERROR;
-    }
-
-    (end_data_offset == NULL || (*end_data_offset = static_cast(offset)));
-
-    return OLAP_SUCCESS;
-
-ADD_PACKED_ROW_BLOCK_ERROR:
-    _check_io_error(res);
-
-    return res;
-}
-
-OLAPStatus OLAPData::add_row_block(const RowBlock& row_block,
-                               uint32_t* start_data_offset,
-                               uint32_t* end_data_offset) {
-    if (!_write_descriptor) {
-        OLAP_LOG_WARNING("segment should be added before.");
-        return OLAP_ERR_NOT_INITED;
-    }
-
-    OLAPStatus res = OLAP_SUCCESS;
-    RowBlockHeaderV2 row_block_header;
     size_t packed_size = 0;
-    RowBlockInfo rb_info = row_block.row_block_info();
-
-    // 返回RowBlockèµ·å§‹ä½ç½®çš„Offset
-    off_t offset = _write_descriptor->file_handle.tell();
-    if (offset == -1) {
-        res = OLAP_ERR_IO_ERROR;
-        goto ADD_ROW_BLOCK_ERROR;
-    }
-
-    (start_data_offset == NULL || (*start_data_offset = static_cast(offset)));
-
     // 使用LZO1C-99压缩RowBlock
-    if (row_block.compress(_write_descriptor->packed_buffer,
-                           OLAP_DEFAULT_MAX_PACKED_ROW_BLOCK_SIZE,
-                           &packed_size,
-                           OLAP_COMP_STORAGE) != OLAP_SUCCESS) {
+    if (row_block->serialize_to_row_format(_write_descriptor->packed_buffer,
+                                           OLAP_DEFAULT_MAX_PACKED_ROW_BLOCK_SIZE,
+                                           &packed_size,
+                                           OLAP_COMP_STORAGE) != OLAP_SUCCESS) {
         OLAP_LOG_WARNING("Fail to compress row block.");
         return OLAP_ERR_COMPRESS_ERROR;
     }
 
-    // æ›´æ–°RowBlockHeader
+    // RowBlockInfo is valid only after serialize is called
+    const RowBlockInfo& rb_info = row_block->row_block_info();
+    RowBlockHeaderV2 row_block_header;
     row_block_header.packed_len = static_cast(packed_size);
     row_block_header.num_rows = rb_info.row_num;
     row_block_header.checksum = rb_info.checksum;
-
-    // 新增
     row_block_header.magic_num = 0;
     row_block_header.version = 1;
-    row_block_header.unpacked_len = row_block.used_buf_len();
+    row_block_header.unpacked_len = rb_info.unpacked_len;
     
     res = _write_descriptor->file_handle.write(&row_block_header, sizeof(row_block_header));
     if (res != OLAP_SUCCESS) {
@@ -736,17 +655,13 @@ OLAPStatus OLAPData::add_row_block(const RowBlock& row_block,
     offset = _write_descriptor->file_handle.tell();
     if (offset == -1) {
         res = OLAP_ERR_IO_ERROR;
-        goto ADD_ROW_BLOCK_ERROR;
+        _check_io_error(res);
+        return res;
     }
 
     (end_data_offset == NULL || (*end_data_offset = static_cast(offset)));
 
     return OLAP_SUCCESS;
-
-ADD_ROW_BLOCK_ERROR:
-    _check_io_error(res);
-
-    return res;
 }
 
 OLAPStatus OLAPData::finalize_segment(uint32_t* data_offset) {
@@ -823,6 +738,57 @@ OLAPStatus OLAPData::set_end_key(const RowCursor* end_key, bool find_last_end_ke
     return OLAP_SUCCESS;
 }
 
+OLAPStatus OLAPData::prepare_block_read(
+        const RowCursor* start_key, bool find_start_key,
+        const RowCursor* end_key, bool find_end_key,
+        RowBlock** row_block) {
+    if (end_key != nullptr) {
+        auto res = set_end_key(end_key, find_end_key);
+        if (res != OLAP_SUCCESS) {
+            // Just ignore this error
+            VLOG(1) << "can't find end_key, end_key:" << end_key->to_string();
+        }
+    }
+    if (start_key != nullptr) {
+        auto row = find_row(*start_key, find_start_key, false);
+        if (row == nullptr) {
+            if (!eof()) {
+                // Some error happened
+                LOG(WARNING) << "failed to find start row row";
+                return OLAP_ERR_INIT_FAILED;
+            }
+            *row_block = nullptr;
+            return OLAP_ERR_DATA_EOF;
+        }
+    } else {
+        auto row = get_first_row();
+        if (row == nullptr) {
+            if (!eof()) {
+                LOG(WARNING) << "failed to get first row";
+                return OLAP_ERR_INIT_FAILED;
+            }
+            *row_block = nullptr;
+            return OLAP_ERR_DATA_EOF;
+        }
+    }
+    *row_block = _row_block_broker->get_row_block_to_read();
+    return OLAP_SUCCESS;
+}
+
+OLAPStatus OLAPData::get_next_block(RowBlock** block) {
+    auto res = get_next_row_block(block, nullptr, nullptr);
+    if (eof()) {
+        *block = nullptr;
+        return OLAP_ERR_DATA_EOF;
+    }
+    if (res != OLAP_SUCCESS) {
+        LOG(WARNING) << "failed to get_next_row_block, res:" << res;
+        return res;
+    }
+    *block = _row_block_broker->get_row_block_to_read();
+    return OLAP_SUCCESS;
+}
+
 void OLAPData::_check_io_error(OLAPStatus res) {
     if (is_io_error(res)) {
         _olap_table->set_io_error();
@@ -845,8 +811,6 @@ OLAPData::RowBlockBroker::RowBlockBroker(
         _is_set_end_row(false),
         _olap_table(olap_table),
         _olap_index(olap_index),
-        _query_conjunct_ctxs(NULL),
-        _delete_conjunct_ctxs(NULL),
         _is_end_block(false),
         _runtime_state(runtime_state) {
     if (_olap_index != NULL) {
@@ -897,30 +861,13 @@ OLAPStatus OLAPData::RowBlockBroker::init() {
 
 const RowCursor* OLAPData::RowBlockBroker::first() {
     _row_index = 0;
-    if (_row_block->get_row_to_read(_row_index, &_row_cursor) != OLAP_SUCCESS) {
-        OLAP_LOG_WARNING("fail to get row from row block. "
-                         "[segment=%d data_offset=%d _row_index=%d]",
-                         _row_block_pos.segment,
-                         _row_block_pos.data_offset,
-                         _row_index);
-        return NULL;
-    }
-
+    _row_block->get_row(_row_index, &_row_cursor);
     return &_row_cursor;
 }
 
 const RowCursor* OLAPData::RowBlockBroker::last() {
     _row_index = _num_rows - 1;
-
-    if (_row_block->get_row_to_read(_row_index, &_row_cursor) != OLAP_SUCCESS) {
-        OLAP_LOG_WARNING("fail to get row from row block. "
-                         "[segment=%d data_offset=%d _row_index=%d]",
-                         _row_block_pos.segment,
-                         _row_block_pos.data_offset,
-                         _row_index);
-        return NULL;
-    }
-
+    _row_block->get_row(_row_index, &_row_cursor);
     return &_row_cursor;
 }
 
@@ -932,14 +879,7 @@ const RowCursor* OLAPData::RowBlockBroker::next(bool* end_of_row_block) {
 
     (end_of_row_block == NULL || (*end_of_row_block = false));
 
-    if (_row_block->get_row_to_read(_row_index, &_row_cursor) != OLAP_SUCCESS) {
-        OLAP_LOG_WARNING("fail to get row from row block. "
-                         "[segment=%d data_offset=%d _row_index=%d]",
-                         _row_block_pos.segment,
-                         _row_block_pos.data_offset,
-                         _row_index);
-        return NULL;
-    }
+    _row_block->get_row(_row_index, &_row_cursor);
 
     return &_row_cursor;
 }
@@ -957,14 +897,7 @@ const RowCursor* OLAPData::RowBlockBroker::find_row(const RowCursor& key,
         return NULL;
     }
 
-    if (_row_block->get_row_to_read(_row_index, &_row_cursor) != OLAP_SUCCESS) {
-        OLAP_LOG_WARNING("fail to get row from row block. "
-                         "[segment=%d, data_offset=%d, _row_index=%d]",
-                         _row_block_pos.segment,
-                         _row_block_pos.data_offset,
-                         _row_index);
-        return NULL;
-    }
+    _row_block->get_row(_row_index, &_row_cursor);
 
     (end_of_row_block == NULL || (*end_of_row_block = false));
     
@@ -976,16 +909,7 @@ const RowCursor* OLAPData::RowBlockBroker::get_row(uint32_t row_index) {
     if (_row_index >= _num_rows) {
         return NULL;
     }
-
-    if (_row_block->get_row_to_read(_row_index, &_row_cursor) != OLAP_SUCCESS) {
-        OLAP_LOG_WARNING("fail to get row from row block."
-                         "[segment=%d, data_offset=%d, _row_index=%d]",
-                         _row_block_pos.segment,
-                         _row_block_pos.data_offset,
-                         _row_index);
-        return NULL;
-    }
-
+    _row_block->get_row(_row_index, &_row_cursor);
     return &_row_cursor;
 }
 
@@ -1002,14 +926,8 @@ const RowCursor* OLAPData::RowBlockBroker::current() {
     return &_row_cursor;
 }
 
-OLAPStatus OLAPData::RowBlockBroker::change_to(
-        const RowBlockPosition& row_block_pos, RuntimeProfile* profile) {
+OLAPStatus OLAPData::RowBlockBroker::change_to(const RowBlockPosition& row_block_pos) {
     OLAPStatus res = OLAP_SUCCESS;
-    RuntimeProfile::Counter* read_data_timer = NULL;
-    if (profile != NULL) {
-        read_data_timer = profile->get_counter("ReadDataTime");    
-    }
-    SCOPED_TIMER(read_data_timer);
 
     // å…ˆå°†æŒæœ‰çš„row_block释放
     this->release();
@@ -1179,26 +1097,6 @@ OLAPStatus OLAPData::RowBlockBroker::_get_row_block(const RowBlockPosition& row_
         goto GET_ROW_BLOCK_ERROR;
     }
 
-    // 过滤删除æ¡ä»¶
-    if (_delete_conjunct_ctxs != NULL) {
-        res = _row_block->eval_conjuncts(*_delete_conjunct_ctxs);
-        if (res != OLAP_SUCCESS) {
-            OLAP_LOG_WARNING("fail to eval delete conjuncts for row block. [res=%d]", res);
-            goto GET_ROW_BLOCK_ERROR;
-        }
-        // ä¿å­˜åˆ é™¤æ¡ä»¶æ‰§è¡ŒåŽçš„结果,以便存入Cache备用
-        _row_block->backup();
-    }
-
-    // 过滤查询æ¡ä»¶
-    if (_query_conjunct_ctxs != NULL) {
-        res = _row_block->eval_conjuncts(*_query_conjunct_ctxs);
-        if (res != OLAP_SUCCESS) {
-            OLAP_LOG_WARNING("fail to eval query conjuncts for row block. [res=%d]", res);
-            goto GET_ROW_BLOCK_ERROR;
-        }
-    }
-
     return OLAP_SUCCESS;
 
 GET_ROW_BLOCK_ERROR:
diff --git a/be/src/olap/olap_data.h b/be/src/olap/olap_data.h
index 9ad732cf95..31a2d12c52 100644
--- a/be/src/olap/olap_data.h
+++ b/be/src/olap/olap_data.h
@@ -54,9 +54,6 @@ public:
     // åˆå§‹åŒ–, å’Œunpickle统一到åŒä¸€æµç¨‹ä¸Š
     virtual OLAPStatus init();
 
-    virtual void set_conjuncts(std::vector* query_conjuncts, 
-                               std::vector* delete_conjuncts);
-
     OLAPStatus get_first_row_block(RowBlock** row_block,
                                const char** packed_row_block,
                                uint32_t* packed_row_block_size);
@@ -79,20 +76,26 @@ public:
 
     // Points the internal cursor to either the first row or the last row.
     // Returns NULL in case of an error.
-    virtual const RowCursor* get_first_row();
-    virtual const RowCursor* get_current_row();
+    const RowCursor* get_first_row();
 
     // Advances the internal cursor to the next row and returns that row.
     // Sets _eof if there is no more row left.
-    virtual const RowCursor* get_next_row();
+    const RowCursor* get_next_row();
 
     // Points internal cursor to the first row equal to or larger than 'key'.
     // to key. Returns a pointer to the row or NULL if 1) there is an
     // error, or 2) the key exceeds any row in the table.
-    virtual const RowCursor* find_row(const RowCursor& key, bool find_last_key, bool is_end_key);
+    const RowCursor* find_row(const RowCursor& key, bool find_last_key, bool is_end_key);
 
     // find_last_end_key false:<; true:<=
-    virtual OLAPStatus set_end_key(const RowCursor* end_key, bool find_last_end_key);
+    OLAPStatus set_end_key(const RowCursor* end_key, bool find_last_end_key);
+
+    OLAPStatus prepare_block_read(
+        const RowCursor* start_key, bool find_start_key,
+        const RowCursor* end_key, bool find_end_key,
+        RowBlock** block) override;
+
+    OLAPStatus get_next_block(RowBlock** block) override;
 
     // The following four functions are used for creating new date
     // files. add_segment() and finalize_segment() start and end a new
@@ -100,13 +103,6 @@ public:
     // add a new data block to the current segment.(only writer)
     OLAPStatus add_segment();
 
-    // Add packed row block into OLAPData.
-    OLAPStatus add_packed_row_block(const RowBlock* row_block,
-                                const char* packed_row_block,
-                                uint32_t packed_row_block_size,
-                                uint32_t* start_data_offset,
-                                uint32_t* end_data_offset);
-
     // TODO(fdy): 未实现方法,等待有使用需求时再实现
     OLAPStatus add_packed_rowblock(
             const char* packed_row_block, const uint32_t packed_row_block_size);
@@ -119,9 +115,9 @@ public:
     //                                 equals to the current segment file length.
     // @return  OLAPStatus  OLAP_SUCCESS if succeed, or else OLAP_ERR_XXX
     // @note
-    OLAPStatus add_row_block(const RowBlock& row_block,
-                         uint32_t* start_data_offset,
-                         uint32_t* end_data_offset);
+    OLAPStatus add_row_block(RowBlock* row_block,
+                             uint32_t* start_data_offset,
+                             uint32_t* end_data_offset);
 
     // 结束segment,回写头部
     OLAPStatus finalize_segment(uint32_t* data_offset);
@@ -161,7 +157,7 @@ private:
         OLAPStatus init();
 
         // 根据block position,在文件中定位row_block
-        OLAPStatus change_to(const RowBlockPosition& row_block_pos, RuntimeProfile* profile);
+        OLAPStatus change_to(const RowBlockPosition& row_block_pos);
         // 释放当前持有的row_block
         OLAPStatus release();
 
@@ -173,12 +169,6 @@ private:
         const RowCursor* next(bool* end_of_row_block);
         const RowCursor* find_row(const RowCursor& key, bool find_last_key, bool* end_of_row_block);
 
-        void set_conjuncts(std::vector* query_conjunct_ctxs, 
-                           std::vector* delete_conjunct_ctxs) {
-            _query_conjunct_ctxs = query_conjunct_ctxs;
-            _delete_conjunct_ctxs = delete_conjunct_ctxs;
-        }
-
         void set_end_row(const RowBlockPosition& end_block_position, uint32_t end_row_index) {
             _end_block_position = end_block_position;
             _end_row_index = end_row_index;
@@ -212,6 +202,12 @@ private:
             return _row_block;
         }
 
+        RowBlock* get_row_block_to_read() {
+            _row_block->set_pos(_row_index);
+            _row_block->set_limit(_num_rows);
+            return _row_block;
+        }
+
         const char* packed_row_block() {
             return _read_buffer + _row_block_header_size;
         }
@@ -224,6 +220,8 @@ private:
             return _is_end_block;
         }
 
+        uint32_t num_rows() const { return _num_rows; }
+
         Tuple* get_next_tuple();
         
     private:
@@ -248,9 +246,6 @@ private:
         OLAPTable* _olap_table;
         OLAPIndex* _olap_index;
 
-        std::vector* _query_conjunct_ctxs;
-        std::vector* _delete_conjunct_ctxs;
-
         uint64_t _data_read_buf_size;
         bool _is_end_block;
         RuntimeState* _runtime_state;
@@ -355,11 +350,7 @@ private:
         // 取block里的最后一条数据与key进行比较,返回小于的结果
         // TODO(hujie01): 比较block暂时不使用过滤条件
         uint32_t row_num = block->row_block_info().row_num;
-        if ((res = block->get_row_to_read(row_num - 1, _helper_cursor, true)) 
-                != OLAP_SUCCESS) {
-            OLAP_LOG_FATAL("fail to invoke get_row. [res=%d]", res);
-            throw ComparatorException();
-        }
+        block->get_row(row_num - 1, _helper_cursor);
 
         if (comparator_enum == COMPARATOR_LESS) {
             return _helper_cursor->cmp(key) < 0;
diff --git a/be/src/olap/olap_define.h b/be/src/olap/olap_define.h
index 45e525d459..3d80eb641b 100644
--- a/be/src/olap/olap_define.h
+++ b/be/src/olap/olap_define.h
@@ -46,7 +46,15 @@ static const size_t OLAP_LRU_CACHE_MAX_KEY_LENTH = OLAP_MAX_PATH_LEN * 2;
 
 static const uint64_t OLAP_FIX_HEADER_MAGIC_NUMBER = 0;
 // 执行be/ce时默认的候选集大å°
-static constexpr uint32_t OLAP_EXPANSION_DEFAULT_CANDIDATE_SIZE = 10;
+static constexpr uint32_t OLAP_COMPACTION_DEFAULT_CANDIDATE_SIZE = 10;
+
+// the max length supported for string type
+static const uint16_t OLAP_STRING_MAX_LENGTH = 65535;
+
+// the max bytes for stored string length
+using StringOffsetType = uint32_t;
+using StringLengthType = uint16_t;
+static const uint16_t OLAP_STRING_MAX_BYTES = sizeof(StringLengthType);
 
 enum OLAPDataVersion {
     OLAP_V1 = 0,
@@ -190,7 +198,7 @@ enum OLAPStatus {
     OLAP_ERR_READER_ACQUIRE_DATA_ERROR = -702,
     OLAP_ERR_READER_READING_ERROR = -703,
 
-    // BaseExpansion
+    // BaseCompaction
     // [-800, -900)
     OLAP_ERR_BE_VERSION_NOT_MATCH = -800,
     OLAP_ERR_BE_REPLACE_VERSIONS_ERROR = -801,
@@ -299,6 +307,13 @@ static const char* const HINIS_KEY_SEPARATOR = ";";
 static const char* const HINIS_KEY_PAIR_SEPARATOR = "|";
 static const char* const HINIS_KEY_GROUP_SEPARATOR = "&";
 
+#define RETURN_NOT_OK(s) do { \
+    OLAPStatus _s = (s);      \
+    if (_s != OLAP_SUCCESS) { \
+        return _s; \
+    } \
+} while (0);
+
 // Declare copy constructor and equal operator as private
 #ifndef DISALLOW_COPY_AND_ASSIGN
 #define DISALLOW_COPY_AND_ASSIGN(type_t) \
diff --git a/be/src/olap/olap_engine.cpp b/be/src/olap/olap_engine.cpp
index dbc06be1aa..461f794364 100644
--- a/be/src/olap/olap_engine.cpp
+++ b/be/src/olap/olap_engine.cpp
@@ -28,8 +28,8 @@
 #include 
 #include 
 
-#include "olap/base_expansion_handler.h"
-#include "olap/cumulative_handler.h"
+#include "olap/base_compaction.h"
+#include "olap/cumulative_compaction.h"
 #include "olap/lru_cache.h"
 #include "olap/olap_header.h"
 #include "olap/olap_rootpath.h"
@@ -262,17 +262,17 @@ OLAPStatus OLAPEngine::init() {
     // 初始化CE调度器
     vector all_root_paths_stat;
     OLAPRootPath::get_instance()->get_all_disk_stat(&all_root_paths_stat);
-    _ce_disk_stat.reserve(all_root_paths_stat.size());
+    _cumulative_compaction_disk_stat.reserve(all_root_paths_stat.size());
     for (uint32_t i = 0; i < all_root_paths_stat.size(); i++) {
         const OLAPRootPathStat& stat = all_root_paths_stat[i];
-        _ce_disk_stat.emplace_back(stat.root_path, i, stat.is_used);
+        _cumulative_compaction_disk_stat.emplace_back(stat.root_path, i, stat.is_used);
         _disk_id_map[stat.root_path] = i;
     }
-    int32_t ce_thread_num = config::cumulative_thread_num;
-    int32_t be_thread_num = config::base_expansion_thread_num;
+    int32_t cumulative_compaction_num_threads = config::cumulative_compaction_num_threads;
+    int32_t base_compaction_num_threads = config::base_compaction_num_threads;
     uint32_t file_system_num = OLAPRootPath::get_instance()->get_file_system_count();
-    _max_ce_task_per_disk = (ce_thread_num + file_system_num - 1) / file_system_num;
-    _max_be_task_per_disk = (be_thread_num + file_system_num - 1) / file_system_num;
+    _max_cumulative_compaction_task_per_disk = (cumulative_compaction_num_threads + file_system_num - 1) / file_system_num;
+    _max_base_compaction_task_per_disk = (base_compaction_num_threads + file_system_num - 1) / file_system_num;
 
     // 加载所有table
     OLAPRootPath::get_instance()->get_all_available_root_path(&all_available_root_path);
@@ -435,7 +435,7 @@ OLAPStatus OLAPEngine::add_table(TTabletId tablet_id, SchemaHash schema_hash, OL
         smart_table->mark_dropped();
         res = OLAP_ERR_ENGINE_INSERT_EXISTS_TABLE;
     }
-    OLAP_LOG_WARNING("add dumplicated table. [res=%d tablet_id=%ld schema_hash=%d "
+    OLAP_LOG_WARNING("add duplicated table. [res=%d tablet_id=%ld schema_hash=%d "
                      "old_version=%d new_version=%d old_time=%ld new_time=%ld]",
                      res, tablet_id, schema_hash,
                      old_version, new_version, old_time, new_time);
@@ -872,7 +872,7 @@ OLAPStatus OLAPEngine::report_all_tablets_info(
     return OLAP_SUCCESS;
 }
 
-bool OLAPEngine::_can_do_be_ce(SmartOLAPTable table) {
+bool OLAPEngine::_can_do_compaction(SmartOLAPTable table) {
     // 如果table正在做schema change,则通过选路判断数据是否转换完成
     // 如果选路成功,则转换完成,可以进行BE
     // 如果选路失败,则转换未完成,不能进行BE
@@ -902,89 +902,89 @@ void OLAPEngine::start_clean_fd_cache() {
     OLAP_LOG_TRACE("end clean file descritpor cache");
 }
 
-void OLAPEngine::start_base_expansion(string* last_be_fs, TTabletId* last_be_tablet_id) {
-    uint64_t allow_be_excute_start_time = config::be_policy_start_time;
-    uint64_t allow_be_excute_end_time = config::be_policy_end_time;
+void OLAPEngine::start_base_compaction(string* last_base_compaction_fs, TTabletId* last_base_compaction_tablet_id) {
+    uint64_t base_compaction_start_hour = config::base_compaction_start_hour;
+    uint64_t base_compaction_end_hour = config::base_compaction_end_hour;
     time_t current_time = time(NULL);
     uint64_t current_hour = localtime(&current_time)->tm_hour;
     // 如果执行BE的时间区间设置为类似以下的形式:[1:00, 8:00)
-    if (allow_be_excute_start_time <= allow_be_excute_end_time) {
-        if (current_hour < allow_be_excute_start_time
-                || current_hour >= allow_be_excute_end_time) {
-            OLAP_LOG_TRACE("don't allow to excute base expansion in this time interval. "
+    if (base_compaction_start_hour <= base_compaction_end_hour) {
+        if (current_hour < base_compaction_start_hour
+                || current_hour >= base_compaction_end_hour) {
+            OLAP_LOG_TRACE("don't allow to excute base compaction in this time interval. "
                            "[now_hour=%d; allow_start_time=%d; allow_end_time=%d]",
                            current_hour,
-                           allow_be_excute_start_time,
-                           allow_be_excute_end_time);
+                           base_compaction_start_hour,
+                           base_compaction_end_hour);
             return;
         }
     } else { // 如果执行BE的时间区间设置为类似以下的形式:[22:00, 8:00)
-        if (current_hour < allow_be_excute_start_time
-                && current_hour >= allow_be_excute_end_time) {
-            OLAP_LOG_TRACE("don't allow to excute base expansion in this time interval. "
+        if (current_hour < base_compaction_start_hour
+                && current_hour >= base_compaction_end_hour) {
+            OLAP_LOG_TRACE("don't allow to excute base compaction in this time interval. "
                            "[now_hour=%d; allow_start_time=%d; allow_end_time=%d]",
                            current_hour,
-                           allow_be_excute_start_time,
-                           allow_be_excute_end_time);
+                           base_compaction_start_hour,
+                           base_compaction_end_hour);
             return;
         }
     }
 
     SmartOLAPTable tablet;
-    BaseExpansionHandler base_expansion_handler;
+    BaseCompaction base_compaction;
 
-    bool do_base_expansion = false;
-    OLAP_LOG_TRACE("start_base_expansion begin.");
+    bool do_base_compaction = false;
+    OLAP_LOG_TRACE("start_base_compaction begin.");
     _tablet_map_lock.rdlock();
     _fs_task_mutex.lock();
 
-    if (*last_be_fs != "") {
-        _fs_be_task_num_map[*last_be_fs] -= 1;
-        last_be_fs->clear();
+    if (*last_base_compaction_fs != "") {
+        _fs_base_compaction_task_num_map[*last_base_compaction_fs] -= 1;
+        last_base_compaction_fs->clear();
     }
 
     for (const auto& i : _tablet_map) {
         for (SmartOLAPTable j : i.second.table_arr) {
             // 保证从上一次被选中进行BE的表开始轮询
-            if (i.first <= *last_be_tablet_id) {
+            if (i.first <= *last_base_compaction_tablet_id) {
                 continue;
             }
 
-            if (_fs_be_task_num_map[j->storage_root_path_name()] >= _max_be_task_per_disk) {
+            if (_fs_base_compaction_task_num_map[j->storage_root_path_name()] >= _max_base_compaction_task_per_disk) {
                 continue;
             }
 
             // 跳过正在做schema change的tablet
-            if (!_can_do_be_ce(j)) {
+            if (!_can_do_compaction(j)) {
                 OLAP_LOG_DEBUG("skip tablet, it is schema changing. [tablet=%s]",
                                j->full_name().c_str());
                 continue;
             }
 
-            if (base_expansion_handler.init(j, false) == OLAP_SUCCESS) {
+            if (base_compaction.init(j, false) == OLAP_SUCCESS) {
                 tablet = j;
-                do_base_expansion = true;
-                _fs_be_task_num_map[tablet->storage_root_path_name()] += 1;
-                *last_be_fs = tablet->storage_root_path_name();
-                *last_be_tablet_id = i.first;
+                do_base_compaction = true;
+                _fs_base_compaction_task_num_map[tablet->storage_root_path_name()] += 1;
+                *last_base_compaction_fs = tablet->storage_root_path_name();
+                *last_base_compaction_tablet_id = i.first;
                 goto TRY_START_BE_OK;
             }
         }
     }
 
     // when the loop comes the end, restart from begin
-    *last_be_tablet_id = -1;
+    *last_base_compaction_tablet_id = -1;
 
 TRY_START_BE_OK:
     _fs_task_mutex.unlock();
     _tablet_map_lock.unlock();
-    OLAP_LOG_TRACE("start_base_expansion end.");
+    OLAP_LOG_TRACE("start_base_compaction end.");
 
-    if (do_base_expansion) {
-        OLAP_LOG_NOTICE_PUSH("request", "START_BASE_EXPANSION");
-        OLAPStatus cmd_res = base_expansion_handler.run();
+    if (do_base_compaction) {
+        OLAP_LOG_NOTICE_PUSH("request", "START_BASE_COMPACTION");
+        OLAPStatus cmd_res = base_compaction.run();
         if (cmd_res != OLAP_SUCCESS) {
-            OLAP_LOG_WARNING("failed to do base expansion. [tablet='%s']",
+            OLAP_LOG_WARNING("failed to do base compaction. [tablet='%s']",
                              tablet->full_name().c_str());
         }
     }
@@ -993,9 +993,9 @@ TRY_START_BE_OK:
 void OLAPEngine::_select_candidate() {
     // 这是一个小根堆,用于记录nice最大的top k个candidate tablet
     SmartOLAPTable tablet;
-    typedef priority_queue,
-            ExpansionCandidateComparator> candidate_heap_t;
-    vector candidate_heap_vec(_ce_disk_stat.size());
+    typedef priority_queue,
+            CompactionCandidateComparator> candidate_heap_t;
+    vector candidate_heap_vec(_cumulative_compaction_disk_stat.size());
     for (const auto& i : _tablet_map) {
         uint32_t nice = 0;
         // calc nice
@@ -1005,7 +1005,7 @@ void OLAPEngine::_select_candidate() {
             }
 
             j->obtain_header_rdlock();
-            const uint32_t curr_nice = j->get_expansion_nice_estimate();
+            const uint32_t curr_nice = j->get_compaction_nice_estimate();
             j->release_header_lock();
             nice = curr_nice > nice ? curr_nice : nice;
             tablet = j;
@@ -1015,27 +1015,27 @@ void OLAPEngine::_select_candidate() {
         if (nice > 0) {
             uint32_t disk_id = _disk_id_map[tablet->storage_root_path_name()];
             candidate_heap_vec[disk_id].emplace(nice, i.first, disk_id);
-            if (candidate_heap_vec[disk_id].size() > OLAP_EXPANSION_DEFAULT_CANDIDATE_SIZE) {
+            if (candidate_heap_vec[disk_id].size() > OLAP_COMPACTION_DEFAULT_CANDIDATE_SIZE) {
                 candidate_heap_vec[disk_id].pop();
             }
         }
     }
 
-    _ce_candidate.clear();
-    for (auto& stat : _ce_disk_stat) {
+    _cumulative_compaction_candidate.clear();
+    for (auto& stat : _cumulative_compaction_disk_stat) {
         stat.task_remaining = 0;
     }
 
     for (auto& candidate_heap : candidate_heap_vec) {
         while (!candidate_heap.empty()) {
-            _ce_candidate.push_back(candidate_heap.top());
-            ++_ce_disk_stat[candidate_heap.top().disk_index].task_remaining;
+            _cumulative_compaction_candidate.push_back(candidate_heap.top());
+            ++_cumulative_compaction_disk_stat[candidate_heap.top().disk_index].task_remaining;
             candidate_heap.pop();
         }
     }
 
     // sort small to big
-    sort(_ce_candidate.rbegin(), _ce_candidate.rend(), ExpansionCandidateComparator());
+    sort(_cumulative_compaction_candidate.rbegin(), _cumulative_compaction_candidate.rend(), CompactionCandidateComparator());
 }
 
 void OLAPEngine::start_cumulative_priority() {
@@ -1048,10 +1048,10 @@ void OLAPEngine::start_cumulative_priority() {
     OLAPRootPath::get_instance()->get_all_disk_stat(&all_root_paths_stat);
     for (uint32_t i = 0; i < all_root_paths_stat.size(); i++) {
         uint32_t disk_id = _disk_id_map[all_root_paths_stat[i].root_path];
-        _ce_disk_stat[disk_id].is_used = all_root_paths_stat[i].is_used;
+        _cumulative_compaction_disk_stat[disk_id].is_used = all_root_paths_stat[i].is_used;
     }
 
-    for (auto& disk : _ce_disk_stat) {
+    for (auto& disk : _cumulative_compaction_disk_stat) {
         if (!disk.task_remaining && disk.is_used) {
             is_select = true;
         }
@@ -1061,58 +1061,58 @@ void OLAPEngine::start_cumulative_priority() {
         _select_candidate();
     }
 
-    // traverse _ce_candidate to start cumulative expansion
-    CumulativeHandler cumulative_handler;
-    for (auto it_cand = _ce_candidate.rbegin(); it_cand != _ce_candidate.rend(); ++it_cand) {
-        ExpansionCandidate candidate = *it_cand;
+    // traverse _cumulative_compaction_candidate to start cumulative compaction
+    CumulativeCompaction cumulative_compaction;
+    for (auto it_cand = _cumulative_compaction_candidate.rbegin(); it_cand != _cumulative_compaction_candidate.rend(); ++it_cand) {
+        CompactionCandidate candidate = *it_cand;
         const auto i = _tablet_map.find(candidate.tablet_id);
         if (i == _tablet_map.end()) {
             // tablet已经不存在
-            _ce_candidate.erase(it_cand.base() - 1);
-            --_ce_disk_stat[candidate.disk_index].task_remaining;
+            _cumulative_compaction_candidate.erase(it_cand.base() - 1);
+            --_cumulative_compaction_disk_stat[candidate.disk_index].task_remaining;
             continue;
         }
 
-        if (_ce_disk_stat[candidate.disk_index].task_running >= _max_ce_task_per_disk) {
+        if (_cumulative_compaction_disk_stat[candidate.disk_index].task_running >= _max_cumulative_compaction_task_per_disk) {
             OLAP_LOG_DEBUG("skip tablet, too much ce task on disk %s",
-                    _ce_disk_stat[candidate.disk_index].storage_path.c_str());
+                    _cumulative_compaction_disk_stat[candidate.disk_index].storage_path.c_str());
             // 某个disk上任务数太多,跳过,candidate中保留这个任务
             continue;
         }
 
         for (SmartOLAPTable j : i->second.table_arr) {
-            if (!_can_do_be_ce(j)) {
+            if (!_can_do_compaction(j)) {
                 OLAP_LOG_DEBUG("skip tablet, it is schema changing. [tablet=%s]",
                                j->full_name().c_str());
                 continue;
             }
 
-            if (cumulative_handler.init(j) == OLAP_SUCCESS) {
-                _ce_candidate.erase(it_cand.base() - 1);
-                --_ce_disk_stat[candidate.disk_index].task_remaining;
-                ++_ce_disk_stat[candidate.disk_index].task_running;
+            if (cumulative_compaction.init(j) == OLAP_SUCCESS) {
+                _cumulative_compaction_candidate.erase(it_cand.base() - 1);
+                --_cumulative_compaction_disk_stat[candidate.disk_index].task_remaining;
+                ++_cumulative_compaction_disk_stat[candidate.disk_index].task_running;
                 _fs_task_mutex.unlock();
                 _tablet_map_lock.unlock();
 
                 // start cumulative
-                if (cumulative_handler.run() != OLAP_SUCCESS) {
+                if (cumulative_compaction.run() != OLAP_SUCCESS) {
                     OLAP_LOG_WARNING("failed to do cumulative. [tablet='%s']",
                                      j->full_name().c_str());
                 }
 
                 _fs_task_mutex.lock();
-                --_ce_disk_stat[candidate.disk_index].task_running;
+                --_cumulative_compaction_disk_stat[candidate.disk_index].task_running;
                 _fs_task_mutex.unlock();
                 return;
             }
         }
         // 这个tablet不适合做ce
-        _ce_candidate.erase(it_cand.base() - 1);
-        --_ce_disk_stat[candidate.disk_index].task_remaining;
+        _cumulative_compaction_candidate.erase(it_cand.base() - 1);
+        --_cumulative_compaction_disk_stat[candidate.disk_index].task_remaining;
     }
     _fs_task_mutex.unlock();
     _tablet_map_lock.unlock();
-    OLAP_LOG_TRACE("no tablet selected to do cumulative expansion this loop.");
+    OLAP_LOG_TRACE("no tablet selected to do cumulative compaction this loop.");
 }
 
 void OLAPEngine::get_cache_status(rapidjson::Document* document) const {
@@ -1289,8 +1289,8 @@ OLAPStatus OLAPEngine::_create_new_table_header_file(
         if (true == is_schema_change_table) {
             /*
              * schema change的old_olap_table和new_olap_table的schema进行比较
-             * 1. 新表的列å在旧表中存在,则新版相应列的unique_idå¤ç”¨æ—§è¡¨åˆ—çš„unique_id 
-             * 2. 新表的列å在旧表中ä¸å­˜åœ¨ï¼Œåˆ™æ–°ç‰ˆç›¸åº”列的unique_id设为旧表列的next_unique_id
+             * 1. 新表的列å在旧表中存在,则新表相应列的unique_idå¤ç”¨æ—§è¡¨åˆ—çš„unique_id 
+             * 2. 新表的列å在旧表中ä¸å­˜åœ¨ï¼Œåˆ™æ–°è¡¨ç›¸åº”列的unique_id设为旧表列的next_unique_id
              *    
             */
             size_t field_num = ref_olap_table->tablet_schema().size();
diff --git a/be/src/olap/olap_engine.h b/be/src/olap/olap_engine.h
index 7b4a01fa27..66faba0dc9 100644
--- a/be/src/olap/olap_engine.h
+++ b/be/src/olap/olap_engine.h
@@ -110,7 +110,7 @@ public:
     OLAPStatus clear();
 
     void start_clean_fd_cache();
-    void start_base_expansion(std::string* last_be_fs, TTabletId* last_be_tablet_id);
+    void start_base_compaction(std::string* last_base_compaction_fs, TTabletId* last_base_compaction_tablet_id);
 
     // 调度ce,优先级调度
     void start_cumulative_priority();
@@ -143,22 +143,22 @@ private:
         std::list table_arr;
     };
 
-    struct ExpansionCandidate {
-        ExpansionCandidate(uint32_t nice_, int64_t tablet_id_, uint32_t index_) :
-                nice(nice_), tablet_id(tablet_id_), disk_index(index_) {}
+    struct CompactionCandidate {
+        CompactionCandidate(uint32_t nicumulative_compaction_, int64_t tablet_id_, uint32_t index_) :
+                nice(nicumulative_compaction_), tablet_id(tablet_id_), disk_index(index_) {}
         uint32_t nice; // 优先度
         int64_t tablet_id;
         uint32_t disk_index = -1;
     };
 
-    struct ExpansionCandidateComparator {
-        bool operator()(const ExpansionCandidate& a, const ExpansionCandidate& b) {
+    struct CompactionCandidateComparator {
+        bool operator()(const CompactionCandidate& a, const CompactionCandidate& b) {
             return a.nice > b.nice;
         }
     };
 
-    struct ExpansionDiskStat {
-        ExpansionDiskStat(std::string path, uint32_t index, bool used) :
+    struct CompactionDiskStat {
+        CompactionDiskStat(std::string path, uint32_t index, bool used) :
                 storage_path(path),
                 disk_index(index),
                 task_running(0),
@@ -192,7 +192,7 @@ private:
 
     OLAPStatus _check_existed_or_else_create_dir(const std::string& path);
 
-    bool _can_do_be_ce(SmartOLAPTable table);
+    bool _can_do_compaction(SmartOLAPTable table);
 
     void _select_candidate();
 
@@ -208,13 +208,13 @@ private:
     size_t _global_table_id;
     Cache* _file_descriptor_lru_cache;
     Cache* _index_stream_lru_cache;
-    uint32_t _max_be_task_per_disk;
-    uint32_t _max_ce_task_per_disk;
+    uint32_t _max_base_compaction_task_per_disk;
+    uint32_t _max_cumulative_compaction_task_per_disk;
 
     MutexLock _fs_task_mutex;
-    file_system_task_count_t _fs_be_task_num_map;
-    std::vector _ce_candidate;
-    std::vector _ce_disk_stat;
+    file_system_task_count_t _fs_base_compaction_task_num_map;
+    std::vector _cumulative_compaction_candidate;
+    std::vector _cumulative_compaction_disk_stat;
     std::map _disk_id_map;
 
     DISALLOW_COPY_AND_ASSIGN(OLAPEngine);
diff --git a/be/src/olap/olap_header.cpp b/be/src/olap/olap_header.cpp
index 18ebcc5d63..8086f56e42 100644
--- a/be/src/olap/olap_header.cpp
+++ b/be/src/olap/olap_header.cpp
@@ -24,6 +24,7 @@
 #include 
 
 #include "olap/field.h"
+#include "olap/wrapper_field.h"
 #include "olap/file_helper.h"
 #include "olap/utils.h"
 
@@ -148,7 +149,7 @@ OLAPStatus OLAPHeader::add_version(
         int64_t index_size,
         int64_t data_size,
         int64_t num_rows,
-        std::vector > *column_statistics) {
+        const std::vector>* column_statistics) {
     // Check whether version is valid.
     if (version.first > version.second) {
         OLAP_LOG_WARNING("the version is not valid. [version='%d,%d']",
@@ -204,18 +205,6 @@ OLAPStatus OLAPHeader::add_version(
     return OLAP_SUCCESS;
 }
 
-OLAPStatus OLAPHeader::add_version(
-        Version version,
-        VersionHash version_hash,
-        uint32_t num_segments,
-        time_t max_timestamp,
-        int64_t index_size,
-        int64_t data_size,
-        int64_t num_rows) {
-    return add_version(version, version_hash, num_segments, 
-            max_timestamp, index_size, data_size, num_rows, NULL);
-}
-
 OLAPStatus OLAPHeader::delete_version(Version version) {
     // Find the version that need to be deleted.
     int index = -1;
@@ -265,7 +254,7 @@ OLAPStatus OLAPHeader::delete_all_versions() {
     return OLAP_SUCCESS;
 }
 
-// This function is called when base-expansion, cumulative-expansion, quering.
+// This function is called when base-compaction, cumulative-compaction, quering.
 // we use BFS algorithm to get the shortest version path.
 OLAPStatus OLAPHeader::select_versions_to_span(const Version& target_version,
                                            vector* span_versions) {
@@ -438,7 +427,7 @@ const FileVersionMessage* OLAPHeader::get_latest_version() const {
     return max_version;
 }
 
-const uint32_t OLAPHeader::get_expansion_nice_estimate() const{
+const uint32_t OLAPHeader::get_compaction_nice_estimate() const{
     uint32_t nice = 0;
     bool base_version_exists = false;
     const int32_t point = cumulative_layer_point();
@@ -450,7 +439,7 @@ const uint32_t OLAPHeader::get_expansion_nice_estimate() const{
             base_version_exists = true;
         }
     }
-    nice = nice < config::ce_policy_delta_files_number ? 0 : nice;
+    nice = nice < config::cumulative_compaction_num_singleton_deltas ? 0 : nice;
 
     // base不存在可能是tablet正在做alter table,先不选它,设nice=0
     return base_version_exists ? nice : 0;
diff --git a/be/src/olap/olap_header.h b/be/src/olap/olap_header.h
index 2a99cbc02a..7bc732f88b 100644
--- a/be/src/olap/olap_header.h
+++ b/be/src/olap/olap_header.h
@@ -51,14 +51,6 @@ public:
 
     // Adds a new version to the header. Do not use the proto's
     // add_version() directly.
-    OLAPStatus add_version(Version version,
-            VersionHash version_hash,
-            uint32_t num_segments,
-            time_t max_timestamp,
-            int64_t index_size,
-            int64_t data_size,
-            int64_t num_rows);
-
     OLAPStatus add_version(
         Version version,
         VersionHash version_hash,
@@ -67,7 +59,7 @@ public:
         int64_t index_size,
         int64_t data_size,
         int64_t num_rows,
-        std::vector > *column_statistics);
+        const std::vector>* column_statistics = nullptr);
 
     // Deletes a version from the header.
     OLAPStatus delete_version(Version version);
@@ -94,7 +86,7 @@ public:
 
     const FileVersionMessage* get_lastest_delta_version() const;
     const FileVersionMessage* get_latest_version() const;
-    const uint32_t get_expansion_nice_estimate() const;
+    const uint32_t get_compaction_nice_estimate() const;
     const OLAPStatus version_creation_time(const Version& version, int64_t* creation_time) const;
 
 private:
diff --git a/be/src/olap/olap_index.cpp b/be/src/olap/olap_index.cpp
index 79fbfcd13d..0f9803d769 100644
--- a/be/src/olap/olap_index.cpp
+++ b/be/src/olap/olap_index.cpp
@@ -25,6 +25,7 @@
 #include "olap/row_block.h"
 #include "olap/row_cursor.h"
 #include "olap/utils.h"
+#include "olap/wrapper_field.h"
 
 using std::ifstream;
 using std::string;
@@ -71,15 +72,22 @@ OLAPIndex::OLAPIndex(OLAPTable* table,
         _version_hash(version_hash),
         _current_num_rows_per_row_block(0),
         _inited_column_statistics(false),
-        _column_statistics(_table->num_key_fields(), std::pair(NULL, NULL)) {
+        _column_statistics(
+            _table->num_key_fields(), std::pair(NULL, NULL)) {
     const RowFields& tablet_schema = _table->tablet_schema();
     _short_key_length = 0;
+    _new_short_key_length = 0;
     _short_key_buf = NULL;
 
-    //_short_key_length += (_table->num_short_key_fields() + 7) >> 3;
     for (size_t i = 0; i < _table->num_short_key_fields(); ++i) {
         _short_key_info_list.push_back(tablet_schema[i]);
-        _short_key_length += tablet_schema[i].index_length + sizeof(bool);
+        _short_key_length += tablet_schema[i].index_length + 1;// 1 for null byte
+        if (tablet_schema[i].type == OLAP_FIELD_TYPE_CHAR ||
+            tablet_schema[i].type == OLAP_FIELD_TYPE_VARCHAR) {
+            _new_short_key_length += sizeof(StringSlice) + 1;
+        } else {
+            _new_short_key_length += tablet_schema[i].index_length + 1;
+        }
     }
 
     _index_loaded = false;
@@ -135,7 +143,7 @@ void OLAPIndex::delete_all_files() {
 }
 
 OLAPStatus OLAPIndex::set_column_statistics(
-        std::vector > &column_statistics) {
+        const std::vector>& column_statistics) {
     if (_inited_column_statistics) {
         return OLAP_SUCCESS;
     }
@@ -147,25 +155,17 @@ OLAPStatus OLAPIndex::set_column_statistics(
     }
 
     for (size_t i = 0; i < _column_statistics.size(); ++i) {
-        _column_statistics[i].first = Field::create(_table->tablet_schema()[i]);
+        _column_statistics[i].first = WrapperField::create(_table->tablet_schema()[i]);
         if (_column_statistics[i].first == NULL) {
             OLAP_LOG_FATAL("fail to create column statistics field. [field_id=%lu]", i);
             return OLAP_ERR_MALLOC_ERROR;
         }
-        if (!_column_statistics[i].first->allocate()) {
-            OLAP_LOG_FATAL("fail to allocate column statistics field. [field_id=%lu]", i);
-            return OLAP_ERR_MALLOC_ERROR;
-        }
         
-        _column_statistics[i].second = Field::create(_table->tablet_schema()[i]);
+        _column_statistics[i].second = WrapperField::create(_table->tablet_schema()[i]);
         if (_column_statistics[i].second == NULL) {
             OLAP_LOG_FATAL("fail to create column statistics field. [field_id=%lu]", i);
             return OLAP_ERR_MALLOC_ERROR;
         }
-        if (!_column_statistics[i].second->allocate()) {
-            OLAP_LOG_FATAL("fail to allocate column statistics field. [field_id=%lu]", i);
-            return OLAP_ERR_MALLOC_ERROR;
-        }
     }
 
     for (size_t i = 0; i < _column_statistics.size(); ++i) {
@@ -192,25 +192,17 @@ OLAPStatus OLAPIndex::set_column_statistics_from_string(
     }
 
     for (size_t i = 0; i < _column_statistics.size(); ++i) {
-        _column_statistics[i].first = Field::create(_table->tablet_schema()[i]);
+        _column_statistics[i].first = WrapperField::create(_table->tablet_schema()[i]);
         if (_column_statistics[i].first == NULL) {
             OLAP_LOG_FATAL("fail to create column statistics field. [field_id=%lu]", i);
             return OLAP_ERR_MALLOC_ERROR;
         }
-        if (!_column_statistics[i].first->allocate()) {
-            OLAP_LOG_FATAL("fail to allocate column statistics field. [field_id=%lu]", i);
-            return OLAP_ERR_MALLOC_ERROR;
-        }
         
-        _column_statistics[i].second = Field::create(_table->tablet_schema()[i]);
+        _column_statistics[i].second = WrapperField::create(_table->tablet_schema()[i]);
         if (_column_statistics[i].second == NULL) {
             OLAP_LOG_FATAL("fail to create column statistics field. [field_id=%lu]", i);
             return OLAP_ERR_MALLOC_ERROR;
         }
-        if (!_column_statistics[i].second->allocate()) {
-            OLAP_LOG_FATAL("fail to allocate column statistics field. [field_id=%lu]", i);
-            return OLAP_ERR_MALLOC_ERROR;
-        }
     }
 
     OLAPStatus res = OLAP_SUCCESS;
@@ -253,8 +245,8 @@ OLAPStatus OLAPIndex::load() {
         return res;
     }
 
-    if (_index.init(_short_key_length, _table->num_short_key_fields(), 
-            &_short_key_info_list) != OLAP_SUCCESS) {
+    if (_index.init(_short_key_length, _new_short_key_length,
+                    _table->num_short_key_fields(), &_short_key_info_list) != OLAP_SUCCESS) {
         OLAP_LOG_WARNING("fail to create MemIndex. [num_segment=%d]", _num_segments);
         return res;
     }
@@ -389,7 +381,7 @@ OLAPStatus OLAPIndex::find_short_key(const RowCursor& key,
     return _index.get_row_block_position(offset, pos);
 }
 
-OLAPStatus OLAPIndex::get_row_block_entry(const RowBlockPosition& pos, Slice* entry) const {
+OLAPStatus OLAPIndex::get_row_block_entry(const RowBlockPosition& pos, EntrySlice* entry) const {
     TABLE_PARAM_VALIDATE();
     SLICE_PARAM_VALIDATE(entry);
     
@@ -536,11 +528,7 @@ OLAPStatus OLAPIndex::add_segment() {
 
 OLAPStatus OLAPIndex::add_row_block(const RowBlock& row_block, const uint32_t data_offset) {
     // get first row of the row_block to distill index item.
-    if (row_block.get_row_to_read(0, &_current_index_row) != OLAP_SUCCESS) {
-        OLAP_LOG_WARNING("get first row in row_block fail.");
-        return OLAP_ERR_ROWBLOCK_FIND_ROW_EXCEPTION;
-    }
-
+    row_block.get_row(0, &_current_index_row);
     return add_short_key(_current_index_row, data_offset);
 }
 
@@ -553,7 +541,7 @@ OLAPStatus OLAPIndex::add_short_key(const RowCursor& short_key, const uint32_t d
     //offset += short_key.get_num_null_byte();
     for (size_t i = 0; i < _short_key_info_list.size(); i++) {
         short_key.write_index_by_index(i, _short_key_buf + offset);
-        offset += short_key.get_field_size(i);
+        offset += short_key.get_index_size(i) + 1;
     }
 
     // 写入Short Key对应的数据
@@ -634,6 +622,16 @@ uint64_t OLAPIndex::num_index_entries() const {
     return _index.count();
 }
 
+MemIndex::MemIndex()
+    : _key_length(0),
+      _num_entries(0),
+      _index_size(0),
+      _data_size(0),
+      _num_rows(0) {
+    _tracker.reset(new MemTracker(-1));
+    _mem_pool.reset(new MemPool(_tracker.get()));
+}
+
 MemIndex::~MemIndex() {
     _num_entries = 0;
     for (vector::iterator it = _meta.begin(); it != _meta.end(); ++it) {
@@ -673,40 +671,60 @@ OLAPStatus MemIndex::load_segment(const char* file, size_t *current_num_rows_per
 
     // 允许索引内容为空
     // 索引长度必须为索引项长度的整数倍
-    meta.buffer.length = meta.file_header.file_length() - meta.file_header.size();
+    size_t storage_length = meta.file_header.file_length() - meta.file_header.size();
     bool null_supported = false;
     //null_supported是为了兼容之前没有NULL字节的数据。
     //目前索引里面都加入了NULL的标志位,entry length都算了NULL标志位构成的bytes
     //对于没有标志位的索引,读取数据之后需要对每个字段补齐这部分。
-    if (false == meta.file_header.message().has_null_supported()) {
+    if (!meta.file_header.message().has_null_supported()) {
         null_supported = false;
     } else {
         null_supported = meta.file_header.message().null_supported();
     }
     size_t num_short_key_fields = short_key_num();
     bool is_align = false;
-    if (false == null_supported) {
-        is_align = (0 == meta.buffer.length % (entry_length() - num_short_key_fields));
+    if (!null_supported) {
+        is_align = (0 == storage_length % (entry_length() - num_short_key_fields));
     } else {
-        is_align = (0 == meta.buffer.length % entry_length());
+        is_align = (0 == storage_length % entry_length());
     }
-    if (false == is_align) {
+    if (!is_align) {
         res = OLAP_ERR_INDEX_LOAD_ERROR;
         OLAP_LOG_WARNING("fail to load_segment, buffer length is not correct.");
         OLAP_LOG_WARNING("load segment for loading index error. [file=%s; res=%d]", file, res);
         file_handler.close();
         return res;
     }
-    if (false == null_supported) {
-        num_entries = meta.buffer.length / (entry_length() - num_short_key_fields);
-        meta.buffer.data = reinterpret_cast(
-                calloc(meta.buffer.length + num_entries * num_short_key_fields, 1));
+
+    // calculate the total size of all segments
+    if (!null_supported) {
+        _index_size += meta.file_header.file_length() + num_entries * num_short_key_fields;
+        num_entries = storage_length / (entry_length() - num_short_key_fields);
     } else {
-        num_entries = meta.buffer.length / entry_length();
-        meta.buffer.data = reinterpret_cast(calloc(meta.buffer.length, 1));
+        _index_size += meta.file_header.file_length();
+        num_entries = storage_length / entry_length();
+    }
+    _data_size += meta.file_header.extra().data_length;
+    _num_rows += meta.file_header.extra().num_rows;
+
+    meta.range.first = _num_entries;
+    meta.range.last = meta.range.first + num_entries;
+    _num_entries = meta.range.last;
+    _meta.push_back(meta);
+
+    (current_num_rows_per_row_block == NULL
+     || (*current_num_rows_per_row_block = meta.file_header.message().num_rows_per_block()));
+
+    if (OLAP_UNLIKELY(num_entries == 0)) {
+        file_handler.close();
+        return OLAP_SUCCESS;
     }
 
-    if (meta.buffer.data == NULL) {
+    // convert index memory layout for string type
+    // previous layout is size|data,
+    // target layout is ptr|size, where ptr points to the data
+    char* storage_data = reinterpret_cast(calloc(storage_length, 1));
+    if (storage_data == nullptr) {
         res = OLAP_ERR_MALLOC_ERROR;
         OLAP_LOG_WARNING("load segment for loading index error. [file=%s; res=%d]", file, res);
         file_handler.close();
@@ -715,78 +733,139 @@ OLAPStatus MemIndex::load_segment(const char* file, size_t *current_num_rows_per
 
     // 读取索引内容
     // 为了启动加速,此处可使用mmap方式。
-    if (file_handler.pread(meta.buffer.data,
-                           meta.buffer.length,
+    if (file_handler.pread(storage_data,
+                           storage_length,
                            meta.file_header.size()) != OLAP_SUCCESS) {
         res = OLAP_ERR_IO_ERROR;
         OLAP_LOG_WARNING("load segment for loading index error. [file=%s; res=%d]", file, res);
         file_handler.close();
-        free(meta.buffer.data);
+        free(storage_data);
         return res;
     }
 
-    // calculate the total size of all segments
-    if (false == null_supported) {
-        _index_size += meta.file_header.file_length() + num_entries * num_short_key_fields;
-    } else {
-        _index_size += meta.file_header.file_length();
-    }
-    _data_size += meta.file_header.extra().data_length;
-    _num_rows += meta.file_header.extra().num_rows;
-
     // checksum validation
-    adler_checksum = olap_adler32(ADLER32_INIT, meta.buffer.data, meta.buffer.length);
+    adler_checksum = olap_adler32(ADLER32_INIT, storage_data, storage_length);
     if (adler_checksum != meta.file_header.checksum()) {
         res = OLAP_ERR_INDEX_CHECKSUM_ERROR;
         OLAP_LOG_WARNING("checksum validation error.");
         OLAP_LOG_WARNING("load segment for loading index error. [file=%s; res=%d]", file, res);
         file_handler.close();
-        free(meta.buffer.data);
+        free(storage_data);
         return res;
     }
 
-    if (false == null_supported) {
-        meta.buffer.length += num_entries * num_short_key_fields;
-        const RowFields& tablet_schema = short_key_fields();
-        size_t src = meta.buffer.length - num_entries * num_short_key_fields;
-        size_t dest = meta.buffer.length;
-        for (size_t i = 0; i < num_entries; ++i) {
-            memmove(meta.buffer.data + dest - sizeof(data_file_offset_t),
-                    meta.buffer.data + src - sizeof(data_file_offset_t),
-                    sizeof(data_file_offset_t));
-            dest = dest - sizeof(data_file_offset_t);
-            src = src - sizeof(data_file_offset_t);
-            for (size_t j = num_short_key_fields; j > 0; --j) {
-                size_t index_length = tablet_schema[j-1].index_length;
-                memmove(meta.buffer.data + dest - index_length, 
-                        meta.buffer.data + src - index_length, index_length);
-                *(meta.buffer.data + dest - index_length - 1) &= 0;
-                dest = dest - index_length - 1;
-                src = src - index_length;
+    /*
+     * convert storage layout to memory layout for olapindex
+     * In this procedure, string type(Varchar/Char) should be
+     * converted with caution. The HyperLogLog type can never be
+     * a key, so it does not need to be handled here.
+     */
+
+    size_t storage_row_bytes = entry_length();
+    storage_row_bytes -= (null_supported ? 0 : num_short_key_fields);
+    char* storage_ptr = storage_data;
+    size_t storage_field_offset = 0;
+
+    size_t mem_row_bytes = new_entry_length();
+    char* mem_buf = reinterpret_cast(calloc(num_entries * mem_row_bytes, 1));
+    memset(mem_buf, 0, num_entries * mem_row_bytes);
+    char* mem_ptr = mem_buf;
+    size_t mem_field_offset = 0;
+
+    size_t null_byte = null_supported ? 1 : 0;
+    for (size_t i = 0; i < num_short_key_fields; ++i) {
+        storage_ptr = storage_data + storage_field_offset;
+        storage_field_offset += (*_fields)[i].index_length + null_byte;
+        mem_ptr = mem_buf + mem_field_offset;
+        if ((*_fields)[i].type == OLAP_FIELD_TYPE_VARCHAR) {
+            mem_field_offset += sizeof(StringSlice) + 1;
+            for (size_t j = 0; j < num_entries; ++j) {
+                /*
+                 * Varchar is null_byte|length|content in OlapIndex storage
+                 * Varchar is in nullbyte|length|ptr in memory
+                 * We need copy three part: nullbyte|length|content
+                 * 1. copy null byte
+                 * 2. copy length and content into addrs pointed by ptr
+                 */
+
+                // 1. copy null_byte
+                memory_copy(mem_ptr, storage_ptr, null_byte);
+
+                // 2. copy length and content
+                size_t storage_field_bytes =
+                    *reinterpret_cast(storage_ptr + null_byte);
+                StringSlice* slice = reinterpret_cast(mem_ptr + 1);
+                char* data = reinterpret_cast(_mem_pool->allocate(storage_field_bytes));
+                memory_copy(data, storage_ptr + sizeof(StringLengthType) + null_byte, storage_field_bytes);
+                slice->data = data;
+                slice->size = storage_field_bytes;
+
+                mem_ptr += mem_row_bytes;
+                storage_ptr += storage_row_bytes;
+            }
+        } else if ((*_fields)[i].type == OLAP_FIELD_TYPE_CHAR) {
+            mem_field_offset += sizeof(StringSlice) + 1;
+            size_t storage_field_bytes = (*_fields)[i].index_length;
+            for (size_t j = 0; j < num_entries; ++j) {
+                /*
+                 * Char is in nullbyte|content with fixed length in OlapIndex
+                 * Char is in nullbyte|length|ptr in memory
+                 * We need copy three part: nullbyte|length|content
+                 * 1. copy null byte
+                 * 2. copy length and content into addrs pointed by ptr
+                 */
+
+                // 1. copy null_byte
+                memory_copy(mem_ptr, storage_ptr, null_byte);
+
+                // 2. copy length and content
+                StringSlice* slice = reinterpret_cast(mem_ptr + 1);
+                char* data = reinterpret_cast(_mem_pool->allocate(storage_field_bytes));
+                memory_copy(data, storage_ptr + null_byte, storage_field_bytes);
+                slice->data = data;
+                slice->size = storage_field_bytes;
+
+                mem_ptr += mem_row_bytes;
+                storage_ptr += storage_row_bytes;
+            }
+        } else {
+            size_t storage_field_bytes = (*_fields)[i].index_length;
+            mem_field_offset += storage_field_bytes + 1;
+            for (size_t j = 0; j < num_entries; ++j) {
+                memory_copy(mem_ptr + 1 - null_byte, storage_ptr, storage_field_bytes + null_byte);
+
+                mem_ptr += mem_row_bytes;
+                storage_ptr += storage_row_bytes;
             }
         }
     }
 
-    meta.range.first = _num_entries;
-    meta.range.last = meta.range.first + num_entries;
-    _num_entries = meta.range.last;
-    _meta.push_back(meta);
+    mem_ptr = mem_buf + mem_field_offset;
+    storage_ptr = storage_data + storage_field_offset;
+    size_t data_file_offset = sizeof(data_file_offset_t);
+    for (size_t j = 0; j < num_entries; ++j) {
+        memory_copy(mem_ptr, storage_ptr, data_file_offset);
+        mem_ptr += mem_row_bytes;
+        storage_ptr += storage_row_bytes;
+    }
 
-    (current_num_rows_per_row_block == NULL
-         || (*current_num_rows_per_row_block = meta.file_header.message().num_rows_per_block())); 
+    _meta.back().buffer.data = mem_buf;
+    _meta.back().buffer.length = num_entries * mem_row_bytes;
+    free(storage_data);
 
     file_handler.close();
-
     return OLAP_SUCCESS;
 }
 
-OLAPStatus MemIndex::init(size_t short_key_len, size_t short_key_num, RowFields* fields) {
+OLAPStatus MemIndex::init(size_t short_key_len, size_t new_short_key_len,
+                          size_t short_key_num, RowFields* fields) {
     if (fields == NULL) {
         OLAP_LOG_WARNING("fail to init MemIndex, NULL short key fields.");
         return OLAP_ERR_INDEX_LOAD_ERROR;
     }
 
     _key_length = short_key_len;
+    _new_key_length = new_short_key_len;
     _key_num = short_key_num;
     _fields = fields;
 
@@ -912,24 +991,24 @@ const OLAPIndexOffset MemIndex::get_offset(const RowBlockPosition& pos) const {
     uint32_t file_header_size = _meta[pos.segment].file_header.size();
     if (pos.segment >= segment_count()
             || pos.index_offset > file_header_size + _meta[pos.segment].buffer.length
-            || (pos.index_offset - file_header_size) % entry_length() != 0) {
+            || (pos.index_offset - file_header_size) % new_entry_length() != 0) {
         return end();
     }
 
     OLAPIndexOffset off;
     off.segment = pos.segment;
-    off.offset = (pos.index_offset - _meta[pos.segment].file_header.size()) / entry_length();
+    off.offset = (pos.index_offset - _meta[pos.segment].file_header.size()) / new_entry_length();
 
     return off;
 }
 
-OLAPStatus MemIndex::get_entry(const OLAPIndexOffset& pos, Slice* slice) const {
+OLAPStatus MemIndex::get_entry(const OLAPIndexOffset& pos, EntrySlice* slice) const {
     if (pos.segment >= segment_count() || pos.offset >= _meta[pos.segment].count()) {
         return OLAP_ERR_INDEX_EOF;
     }
 
-    slice->length = entry_length();
-    slice->data = _meta[pos.segment].buffer.data + pos.offset * entry_length();
+    slice->length = new_entry_length();
+    slice->data = _meta[pos.segment].buffer.data + pos.offset * new_entry_length();
 
     return OLAP_SUCCESS;
 }
@@ -952,16 +1031,16 @@ OLAPStatus MemIndex::get_row_block_position(
 
     rbp->segment = pos.segment;
     rbp->data_offset = *reinterpret_cast(
-                           _meta[pos.segment].buffer.data +
-                           pos.offset * entry_length() + short_key_length());
-    rbp->index_offset = _meta[pos.segment].file_header.size() + pos.offset * entry_length();
+                            _meta[pos.segment].buffer.data +
+                            pos.offset * new_entry_length() + new_short_key_length());
+    rbp->index_offset = _meta[pos.segment].file_header.size() + pos.offset * new_entry_length();
 
     if (pos.offset == _meta[pos.segment].count() - 1) {
         rbp->block_size = _meta[pos.segment].file_header.extra().data_length - rbp->data_offset;
     } else {
         uint32_t next_offset = *reinterpret_cast(
                                    _meta[pos.segment].buffer.data +
-                                   (pos.offset + 1) * entry_length() + short_key_length());
+                                   (pos.offset + 1) * new_entry_length() + new_short_key_length());
         rbp->block_size = next_offset - rbp->data_offset;
     }
 
diff --git a/be/src/olap/olap_index.h b/be/src/olap/olap_index.h
index a8ff2b92ac..78318e9425 100644
--- a/be/src/olap/olap_index.h
+++ b/be/src/olap/olap_index.h
@@ -42,6 +42,7 @@ class OLAPTable;
 class RowBlock;
 class RowCursor;
 class SegmentComparator;
+class WrapperField;
 
 typedef uint32_t data_file_offset_t;
 typedef std::vector RowFields;
@@ -53,9 +54,10 @@ struct OLAPIndexFixedHeader {
     uint64_t num_rows;
 };
 
-struct Slice {
+struct EntrySlice {
     char* data;
     size_t length;
+    EntrySlice() : data(nullptr), length(0) {}
 };
 
 // Range of offset in one segment
@@ -87,8 +89,8 @@ struct RowBlockPosition {
 
     bool operator==(const RowBlockPosition& other) const {
         return (segment == other.segment
-                    && block_size == other.block_size
                     && data_offset == other.data_offset
+                    && block_size == other.block_size
                     && index_offset == other.index_offset);
     }
 
@@ -156,7 +158,7 @@ struct SegmentMetaInfo {
     }
 
     IDRange     range;
-    Slice       buffer;
+    EntrySlice       buffer;
     FileHeader  file_header;
 };
 
@@ -167,16 +169,12 @@ public:
     friend class IndexComparator;
     friend class SegmentComparator;
 
-    explicit MemIndex() :
-            _key_length(0),
-            _num_entries(0),
-            _index_size(0),
-            _data_size(0),
-            _num_rows(0) {}
+    MemIndex();
     ~MemIndex();
 
     // 初始化MemIndex, 传入short_key的总长度和对应的Field数组
-    OLAPStatus init(size_t short_key_len, size_t short_key_num, RowFields* fields);
+    OLAPStatus init(size_t short_key_len, size_t new_short_key_len,
+                    size_t short_key_num, RowFields* fields);
 
     // 加载一个segment到内存
     OLAPStatus load_segment(const char* file, size_t *current_num_rows_per_row_block);
@@ -248,7 +246,7 @@ public:
     const OLAPIndexOffset get_relative_offset(iterator_offset_t absolute_offset) const;
 
     // Return content of index item, which IndexOffset is pos
-    OLAPStatus get_entry(const OLAPIndexOffset& pos, Slice* slice) const;
+    OLAPStatus get_entry(const OLAPIndexOffset& pos, EntrySlice* slice) const;
 
     // Return RowBlockPosition from IndexOffset
     OLAPStatus get_row_block_position(const OLAPIndexOffset& pos, RowBlockPosition* rbp) const;
@@ -263,12 +261,20 @@ public:
         return _key_length;
     }
 
+    const size_t new_short_key_length() const {
+        return _new_key_length;
+    }
+
     // Return length of full index item,
     // which actually equals to short_key_length() plus sizeof(data_file_offset_t)
     const size_t entry_length() const {
         return short_key_length() + sizeof(data_file_offset_t);
     }
 
+    const size_t new_entry_length() const {
+        return _new_key_length + sizeof(data_file_offset_t);
+    }
+
     // Return short key FieldInfo array
     const RowFields& short_key_fields() const {
         return *_fields;
@@ -319,6 +325,7 @@ public:
 private:
     std::vector _meta;
     size_t _key_length;
+    size_t _new_key_length;
     size_t _key_num;
     size_t _num_entries;
     size_t _index_size;
@@ -326,6 +333,8 @@ private:
     size_t _num_rows;
     RowFields*  _fields;
 
+    std::unique_ptr _tracker;
+    std::unique_ptr _mem_pool;
     DISALLOW_COPY_AND_ASSIGN(MemIndex);
 };
 
@@ -361,13 +370,11 @@ private:
     bool _compare(const iterator_offset_t& index,
                   const RowCursor& key,
                   ComparatorEnum comparator) {
-        Slice slice;
+        EntrySlice slice;
         OLAPIndexOffset offset(_cur_seg, index);
         _index->get_entry(offset, &slice);
 
-        if (_helper_cursor->attach(slice.data, _index->short_key_length()) != OLAP_SUCCESS) {
-            throw ComparatorException();
-        }
+        _helper_cursor->attach(slice.data);
 
         if (comparator == COMPARATOR_LESS) {
             return _helper_cursor->index_cmp(key) < 0;
@@ -403,13 +410,12 @@ private:
     bool _compare(const iterator_offset_t& index,
                   const RowCursor& key,
                   ComparatorEnum comparator) {
-        Slice slice;
+        EntrySlice slice;
         slice.data = _index->_meta[index].buffer.data;
-        slice.length = _index->short_key_length();
+        //slice.length = _index->short_key_length();
+        slice.length = _index->new_short_key_length();
 
-        if (_helper_cursor->attach(slice.data, _index->short_key_length()) != OLAP_SUCCESS) {
-            throw ComparatorException();
-        }
+        _helper_cursor->attach(slice.data);
 
         if (comparator == COMPARATOR_LESS) {
             return _helper_cursor->index_cmp(key) < 0;
@@ -448,9 +454,10 @@ public:
         return _inited_column_statistics;
     }
 
-    OLAPStatus set_column_statistics(std::vector > &column_statistics);
+    OLAPStatus set_column_statistics(
+        const std::vector>& column_statistics);
     
-    std::vector> &get_column_statistics() {
+    const std::vector>& get_column_statistics() {
         return _column_statistics;
     }
 
@@ -499,7 +506,7 @@ public:
 
     OLAPStatus find_prev_point(const RowBlockPosition& current, RowBlockPosition* prev) const;
 
-    OLAPStatus get_row_block_entry(const RowBlockPosition& pos, Slice* entry) const;
+    OLAPStatus get_row_block_entry(const RowBlockPosition& pos, EntrySlice* entry) const;
 
     // Given a starting row block position, advances the position by
     // num_row_blocks, then stores back the new position through the
@@ -577,6 +584,10 @@ public:
     const size_t short_key_length() const {
         return _short_key_length;
     }
+
+    const size_t new_short_key_length() const {
+        return _new_short_key_length;
+    }
     
     const RowFields& short_key_fields() const {
         return _short_key_info_list;
@@ -643,6 +654,7 @@ private:
     RowFields _short_key_info_list;
     // short key对应的总长度
     size_t _short_key_length;
+    size_t _new_short_key_length;
 
     // 以下是写入流程时需要的一些中间状态
     // 当前写入文件的FileHandler
@@ -663,7 +675,7 @@ private:
 
     bool _inited_column_statistics;
 
-    std::vector > _column_statistics;
+    std::vector> _column_statistics;
     std::vector _has_null_flags;
     std::unordered_map > _seg_pb_map;
 
diff --git a/be/src/olap/olap_reader.cpp b/be/src/olap/olap_reader.cpp
index c7db569ac6..c6e55afcae 100644
--- a/be/src/olap/olap_reader.cpp
+++ b/be/src/olap/olap_reader.cpp
@@ -29,14 +29,6 @@ using std::nothrow;
 
 namespace palo {
 
-void OLAPReader::init_profile(RuntimeProfile* profile) {
-    ADD_TIMER(profile, "GetTabletTime");
-    ADD_TIMER(profile, "InitReaderTime");
-    ADD_TIMER(profile, "ReadDataTime");
-    ADD_TIMER(profile, "ShowHintsTime");
-    ADD_COUNTER(profile, "RawRowsRead", TUnit::UNIT);
-}
-
 Status OLAPShowHints::show_hints(
         TShowHintsRequest& fetch_request,
         std::vector>>* ranges, 
@@ -108,273 +100,4 @@ Status OLAPShowHints::show_hints(
     return Status::OK;
 }
 
-OLAPReader *OLAPReader::create(const TupleDescriptor &tuple_desc, RuntimeState* runtime_state) {
-    return new (std::nothrow) OLAPReader(tuple_desc, runtime_state);
-}
-
-Status OLAPReader::init(TFetchRequest& fetch_request, 
-                        std::vector *conjunct_ctxs,
-                        RuntimeProfile* profile) {
-    OLAP_LOG_DEBUG("fetch request:%s", apache::thrift::ThriftDebugString(fetch_request).c_str());
-
-    if (PaloMetrics::palo_fetch_count() != NULL) {
-        PaloMetrics::palo_fetch_count()->increment(1);
-    }
-
-    OLAPStatus res = OLAP_SUCCESS;
-
-    _get_tablet_timer = profile->get_counter("GetTabletTime");
-    _init_reader_timer = profile->get_counter("InitReaderTime");
-
-    _conjunct_ctxs = conjunct_ctxs;
-    _aggregation = fetch_request.aggregation;
-
-    {
-        SCOPED_TIMER(_get_tablet_timer);
-        _olap_table = OLAPEngine::get_instance()->get_table(
-                fetch_request.tablet_id, fetch_request.schema_hash);
-        if (_olap_table.get() == NULL) {
-            OLAP_LOG_WARNING("table does not exists. [tablet_id=%ld schema_hash=%d]",
-                             fetch_request.tablet_id, fetch_request.schema_hash);
-            return Status("table does not exists");
-        }
-    }
-
-    {
-        AutoRWLock auto_lock(_olap_table->get_header_lock_ptr(), true);
-        const FileVersionMessage* message = _olap_table->latest_version();
-        if (message == NULL) {
-            OLAP_LOG_WARNING("fail to get latest version. [tablet_id=%ld]",
-                             fetch_request.tablet_id);
-            return Status("fail to get latest version");
-        }
-
-        if (message->end_version() == fetch_request.version
-                && message->version_hash() != fetch_request.version_hash) {
-            OLAP_LOG_WARNING("fail to check latest version hash. "
-                             "[res=%d tablet_id=%ld version_hash=%ld request_version_hash=%ld]",
-                             res, fetch_request.tablet_id,
-                             message->version_hash(), fetch_request.version_hash);
-            return Status("fail to check version hash");
-        }
-    }
-
-    {
-        SCOPED_TIMER(_init_reader_timer);
-        res = _init_params(fetch_request, profile);
-        if (res != OLAP_SUCCESS) {
-            OLAP_LOG_WARNING("fail to init olap reader.[res=%d]", res);
-            return Status("fail to init olap reader");
-        }
-    }
-
-    _is_inited = true;
-    _read_data_watch.reset();
-    return Status::OK;
-}
-
-Status OLAPReader::close() {
-    return Status::OK;
-}
-
-Status OLAPReader::next_tuple(Tuple* tuple, int64_t* raw_rows_read, bool* eof) {
-    OLAPStatus res = OLAP_SUCCESS;
-    res = _reader.next_row_with_aggregation(&_read_row_cursor, raw_rows_read, eof);
-    if (res != OLAP_SUCCESS) {
-        OLAP_LOG_WARNING("fail to get new row.[res=%d]", res);
-        return Status("fail to get new row");
-    }
-
-    if (!*eof) {
-        res = _convert_row_to_tuple(tuple);
-        if (res != OLAP_SUCCESS) {
-            OLAP_LOG_WARNING("fail to convert row to tuple.[res=%d]", res);
-            return Status("fail to convert row to tuple");
-        }
-    }
-
-    return Status::OK;
-}
-
-OLAPStatus OLAPReader::_convert_row_to_tuple(Tuple* tuple) {
-    RowCursor *row_cursor = NULL;
-    if (_aggregation) {
-        row_cursor = &_read_row_cursor;
-    } else {
-        _return_row_cursor.copy(_read_row_cursor);
-        row_cursor = &_return_row_cursor;
-    }
-
-    int offset = 0;
-    size_t slots_size = _query_slots.size();
-    //offset += row_cursor->get_num_null_byte();
-    for (int i = 0; i < slots_size; ++i)
-    {
-        offset += sizeof(char);
-        if (true == row_cursor->is_null_converted(i)) {
-            tuple->set_null(_query_slots[i]->null_indicator_offset());
-            offset += _request_columns_size[i];
-            if (TYPE_VARCHAR == _query_slots[i]->type().type || TYPE_HLL == _query_slots[i]->type().type) {
-                offset += 2; //row_cursor has an uint16_t storage space for NULL varchar
-            }
-            continue;
-        }
-
-        switch (_query_slots[i]->type().type) {
-        case TYPE_CHAR: {
-            StringValue *slot = tuple->get_string_slot(_query_slots[i]->tuple_offset());
-            slot->ptr = reinterpret_cast(
-                    const_cast(row_cursor->get_buf() + offset));
-            slot->len = strnlen(slot->ptr, _request_columns_size[i]);
-            offset += _request_columns_size[i];
-            break;
-        }
-        case TYPE_VARCHAR:
-        case TYPE_HLL: {
-            StringValue *slot = tuple->get_string_slot(_query_slots[i]->tuple_offset());
-            size_t size = *reinterpret_cast
-                    (const_cast(row_cursor->get_buf() + offset));
-            offset += 2;
-            slot->ptr = reinterpret_cast
-                    (const_cast(row_cursor->get_buf() + offset));
-            slot->len = size;
-            offset += _request_columns_size[i];
-            break;
-        }
-        case TYPE_DECIMAL: {
-            // DecimalValue *slot = tuple->get_decimal_slot(_tuple_desc.slots()[i]->tuple_offset());
-            DecimalValue *slot = tuple->get_decimal_slot(_query_slots[i]->tuple_offset());
-
-            // TODO(lingbin): should remove this assign, use set member function
-            int64_t int_value = *reinterpret_cast(
-                    const_cast(row_cursor->get_buf() + offset));
-            offset += sizeof(int64_t);
-            int32_t frac_value = *reinterpret_cast(
-                    const_cast(row_cursor->get_buf() + offset));
-            offset += sizeof(int32_t);
-            *slot = DecimalValue(int_value, frac_value);
-            break;
-        }
-
-        case TYPE_DATETIME: {
-            DateTimeValue *slot = tuple->get_datetime_slot(
-                    _query_slots[i]->tuple_offset());
-            uint64_t value = *reinterpret_cast(
-                    const_cast(row_cursor->get_buf() + offset));
-            if (!slot->from_olap_datetime(value)) {
-                tuple->set_null(_query_slots[i]->null_indicator_offset());
-            }
-            offset += 8;
-            break;
-        }
-        case TYPE_DATE: {
-            DateTimeValue *slot = tuple->get_datetime_slot(
-                    _query_slots[i]->tuple_offset());
-            uint64_t value = 0;
-            value = *(unsigned char*)(row_cursor->get_buf() + offset + 2);
-            value <<= 8;
-            value |= *(unsigned char*)(row_cursor->get_buf() + offset + 1);
-            value <<= 8;
-            value |= *(unsigned char*)(row_cursor->get_buf() + offset);
-            if (!slot->from_olap_date(value)) {
-                tuple->set_null(_query_slots[i]->null_indicator_offset());
-            }
-            offset += 3;
-            break;
-        }
-        default: {
-            void *slot = tuple->get_slot(_query_slots[i]->tuple_offset());
-            memory_copy(slot, row_cursor->get_buf() + offset, _request_columns_size[i]);
-            offset += _request_columns_size[i];
-            break;
-        }
-        }
-    }
-
-    return OLAP_SUCCESS;
-}
-
-OLAPStatus OLAPReader::_init_params(TFetchRequest& fetch_request, RuntimeProfile* profile) {
-    OLAPStatus res = OLAP_SUCCESS;
-
-    res = _init_return_columns(fetch_request);
-    if (res != OLAP_SUCCESS) {
-        OLAP_LOG_WARNING("fail to init return columns.[res=%d]", res);
-        return res;
-    }
-
-    ReaderParams reader_params;
-    reader_params.olap_table = _olap_table;
-    reader_params.reader_type = READER_FETCH;
-    reader_params.aggregation = fetch_request.aggregation;
-    reader_params.version = Version(0, fetch_request.version);
-    reader_params.conditions = fetch_request.where;
-    reader_params.range = fetch_request.range;
-    reader_params.end_range = fetch_request.end_range;
-    reader_params.start_key = fetch_request.start_key;
-    reader_params.end_key = fetch_request.end_key;
-    reader_params.conjunct_ctxs = _conjunct_ctxs;
-    reader_params.profile = profile;
-    reader_params.runtime_state = _runtime_state;
-
-    if (_aggregation) {
-        reader_params.return_columns = _return_columns;
-    } else {
-        for (size_t i = 0; i < _olap_table->num_key_fields(); ++i) {
-            reader_params.return_columns.push_back(i);
-        }
-        for (size_t i = 0; i < fetch_request.field.size(); ++i) {
-            int32_t index = _olap_table->get_field_index(fetch_request.field[i]);
-            if (_olap_table->tablet_schema()[index].is_key) {
-                continue;
-            } else {
-                reader_params.return_columns.push_back(index);
-            }
-        }
-    }
-    res = _read_row_cursor.init(_olap_table->tablet_schema(), reader_params.return_columns);
-    if (res != OLAP_SUCCESS) {
-        OLAP_LOG_WARNING("fail to init row cursor.[res=%d]", res);
-        return res;
-    }
-    res = _reader.init(reader_params);
-    if (res != OLAP_SUCCESS) {
-        OLAP_LOG_WARNING("fail to init reader.[res=%d]", res);
-        return res;
-    }
-
-    for (int i = 0; i < _tuple_desc.slots().size(); ++i) {
-        if (!_tuple_desc.slots()[i]->is_materialized()) {
-            continue;
-        }
-        _query_slots.push_back(_tuple_desc.slots()[i]);
-    }
-
-    return res;
-}
-
-OLAPStatus OLAPReader::_init_return_columns(TFetchRequest& fetch_request) {
-    for (int32_t i = 0, len = fetch_request.field.size(); i < len; i++) {
-        int32_t index = _olap_table->get_field_index(fetch_request.field[i]);
-        if (index < 0) {
-            OLAP_LOG_WARNING("field name is invalied. [index=%d field='%s']",
-                             index,
-                             fetch_request.field[i].c_str());
-            return OLAP_ERR_FETCH_GET_READER_PARAMS_ERR;
-        }
-
-        _return_columns.push_back(index);
-        if (_olap_table->tablet_schema()[index].type == OLAP_FIELD_TYPE_VARCHAR || _olap_table->tablet_schema()[index].type == OLAP_FIELD_TYPE_HLL) {
-            _request_columns_size.push_back(_olap_table->tablet_schema()[index].length - 
-                    sizeof(VarCharField::LengthValueType));
-        } else {
-            _request_columns_size.push_back(_olap_table->tablet_schema()[index].length);
-        }
-    }
-
-    _return_row_cursor.init(_olap_table->tablet_schema(), _return_columns);
-
-    return OLAP_SUCCESS;
-}
-
 }  // namespace palo
diff --git a/be/src/olap/olap_reader.h b/be/src/olap/olap_reader.h
index 0c773219e3..2979c84ad7 100644
--- a/be/src/olap/olap_reader.h
+++ b/be/src/olap/olap_reader.h
@@ -20,11 +20,11 @@
 #include 
 
 #include "common/object_pool.h"
+#include "common/status.h"
 #include "olap/delete_handler.h"
 #include "olap/i_data.h"
 #include "olap/olap_cond.h"
 #include "olap/olap_engine.h"
-#include "util/palo_metrics.h"
 #include "olap/reader.h"
 
 namespace palo {
@@ -37,84 +37,6 @@ public:
             RuntimeProfile* profile);
 };
 
-class OLAPReader {
-public:
-    static void init_profile(RuntimeProfile* profile);
-    static OLAPReader* create(const TupleDescriptor &tuple_desc, RuntimeState* runtime_state);
-
-    explicit OLAPReader(const TupleDescriptor &tuple_desc) :
-            _tuple_desc(tuple_desc),
-            _conjunct_ctxs(nullptr),
-            _is_inited(false),
-            _request_version(-1),
-            _aggregation(false),
-            _get_tablet_timer(nullptr),
-            _init_reader_timer(nullptr),
-            _read_data_timer(nullptr),
-            _runtime_state(nullptr) {}
-
-    OLAPReader(const TupleDescriptor &tuple_desc, RuntimeState* runtime_state) :
-            _tuple_desc(tuple_desc),
-            _conjunct_ctxs(nullptr),
-            _is_inited(false),
-            _request_version(-1),
-            _aggregation(false),
-            _get_tablet_timer(nullptr),
-            _init_reader_timer(nullptr),
-            _read_data_timer(nullptr),
-            _runtime_state(runtime_state) {}
-
-    ~OLAPReader() {
-        close();
-    }
-
-    Status init(TFetchRequest& fetch_request,
-                std::vector *conjunct_ctxs,
-                RuntimeProfile* profile);
-
-    Status close();
-
-    Status next_tuple(Tuple *tuple, int64_t* raw_rows_read, bool* eof);
-    
-private: 
-    OLAPStatus _init_params(TFetchRequest& fetch_request, RuntimeProfile* profile);
-
-    OLAPStatus _init_return_columns(TFetchRequest& fetch_request);
-
-    OLAPStatus _convert_row_to_tuple(Tuple* tuple);
-
-    Reader _reader;
-
-    const TupleDescriptor &_tuple_desc;
-
-    std::vector *_conjunct_ctxs;
-
-    bool _is_inited;
-
-    int32_t _request_version;
-
-    bool _aggregation;
-
-    SmartOLAPTable _olap_table;
-
-    std::vector _return_columns;
-
-    RowCursor _read_row_cursor;
-
-    RowCursor _return_row_cursor;
-
-    std::vector _request_columns_size;
-
-    std::vector _query_slots;
-
-    // time costed and row returned statistics
-    RuntimeProfile::Counter* _get_tablet_timer;
-    RuntimeProfile::Counter* _init_reader_timer;
-    OlapStopWatch _read_data_watch;
-    RuntimeProfile::Counter* _read_data_timer;
-    RuntimeState* _runtime_state;
-};
-
 }  // namespace palo
 
 #endif // BDG_PALO_BE_SRC_OLAP_OLAP_READER_H
diff --git a/be/src/olap/olap_rootpath.cpp b/be/src/olap/olap_rootpath.cpp
index 4c9f3a4b53..f9ad50fc70 100644
--- a/be/src/olap/olap_rootpath.cpp
+++ b/be/src/olap/olap_rootpath.cpp
@@ -127,7 +127,7 @@ OLAPStatus OLAPRootPath::init() {
         _remove_all_unused_flag_file();
     }
 
-    res = _parse_root_paths_from_string(root_paths.c_str(), &root_path_vec, &capacity_vec);
+    res = parse_root_paths_from_string(root_paths.c_str(), &root_path_vec, &capacity_vec);
     if (res != OLAP_SUCCESS) {
         OLAP_LOG_WARNING("get root path failed. [res=%d root_paths='%s']",
                          res, root_paths.c_str());
@@ -305,7 +305,7 @@ OLAPStatus OLAPRootPath::reload_root_paths(const char* root_paths) {
 
     RootPathVec root_path_vec;
     CapacityVec capacity_vec;
-    res = _parse_root_paths_from_string(root_paths, &root_path_vec, &capacity_vec);
+    res = parse_root_paths_from_string(root_paths, &root_path_vec, &capacity_vec);
     if (res != OLAP_SUCCESS) {
         OLAP_LOG_WARNING("get root path failed when reload root path. [root_paths=%s]", root_paths);
         return res;
@@ -654,7 +654,7 @@ OLAPStatus OLAPRootPath::_create_unused_flag_file(string& unused_flag_file) {
     return res;
 }
 
-OLAPStatus OLAPRootPath::_parse_root_paths_from_string(
+OLAPStatus OLAPRootPath::parse_root_paths_from_string(
         const char* root_paths,
         RootPathVec* root_path_vec,
         CapacityVec* capacity_vec) {
diff --git a/be/src/olap/olap_rootpath.h b/be/src/olap/olap_rootpath.h
index a7c1666468..7221a1fd4d 100644
--- a/be/src/olap/olap_rootpath.h
+++ b/be/src/olap/olap_rootpath.h
@@ -60,6 +60,11 @@ public:
     typedef std::vector RootPathVec;
     typedef std::vector CapacityVec;
 
+    static OLAPStatus parse_root_paths_from_string(
+            const char* root_paths,
+            RootPathVec* root_path_vec,
+            CapacityVec* capacity_vec);
+
     // @brief åˆå§‹åŒ–。
     // 从é…置文件中读å–storage_root_pathä¿¡æ¯ï¼Œé‡åçš„path当æˆä¸€æ¡path,
     // 校验å„root_path的目录ã€ç£ç›˜ç­‰ã€‚
@@ -170,11 +175,6 @@ private:
             CapacityVec* capacity_vec,
             std::vector* is_accessable_vec);
 
-    OLAPStatus _parse_root_paths_from_string(
-            const char* root_paths,
-            RootPathVec* root_path_vec,
-            CapacityVec* capacity_vec);
-
     OLAPStatus _get_root_path_capacity(
             const std::string& root_path,
             int64_t* data_used);
diff --git a/be/src/olap/olap_server.cpp b/be/src/olap/olap_server.cpp
index 87d5efb2fd..39a1fa92fc 100644
--- a/be/src/olap/olap_server.cpp
+++ b/be/src/olap/olap_server.cpp
@@ -25,7 +25,7 @@
 #include 
 
 #include "olap/command_executor.h"
-#include "olap/cumulative_handler.h"
+#include "olap/cumulative_compaction.h"
 #include "olap/olap_common.h"
 #include "olap/olap_define.h"
 #include "olap/olap_engine.h"
@@ -86,24 +86,24 @@ OLAPStatus OLAPServer::init(const char* config_path, const char* config_file) {
     }
 
     // start be and ce threads for merge data
-    int32_t be_thread_num = config::base_expansion_thread_num;
-    _be_threads.resize(be_thread_num, -1);
-    for (uint32_t i = 0; i < be_thread_num; ++i) {
-        if (0 != pthread_create(&_be_threads[i],
+    int32_t base_compaction_num_threads = config::base_compaction_num_threads;
+    _base_compaction_threads.resize(base_compaction_num_threads, -1);
+    for (uint32_t i = 0; i < base_compaction_num_threads; ++i) {
+        if (0 != pthread_create(&_base_compaction_threads[i],
                                 NULL,
-                                _be_thread_callback,
+                                _base_compaction_thread_callback,
                                 NULL)) {
-            OLAP_LOG_FATAL("failed to start base expansion thread. [id=%u]", i); 
+            OLAP_LOG_FATAL("failed to start base compaction thread. [id=%u]", i); 
             return OLAP_ERR_INIT_FAILED;
         }
     }
 
-    int32_t ce_thread_num = config::cumulative_thread_num;
-    _cumulative_threads.resize(ce_thread_num, -1);
-    for (uint32_t i = 0; i < ce_thread_num; ++i) {
-        if (0 != pthread_create(&_cumulative_threads[i], 
+    int32_t cumulative_compaction_num_threads = config::cumulative_compaction_num_threads;
+    _cumulative_compaction_threads.resize(cumulative_compaction_num_threads, -1);
+    for (uint32_t i = 0; i < cumulative_compaction_num_threads; ++i) {
+        if (0 != pthread_create(&(_cumulative_compaction_threads[i]), 
                                 NULL, 
-                                _cumulative_thread_callback, 
+                                _cumulative_compaction_thread_callback, 
                                 NULL)) {
             OLAP_LOG_FATAL("failed to start cumulative thread. [id=%u]", i); 
             return OLAP_ERR_INIT_FAILED;
@@ -125,7 +125,7 @@ void* OLAPServer::_fd_cache_clean_callback(void* arg) {
 #endif
     uint32_t interval = config::file_descriptor_cache_clean_interval;
     if (interval <= 0) {
-        OLAP_LOG_WARNING("base expansion triggler interval config is illegal: [%d], "
+        OLAP_LOG_WARNING("config of file descriptor clean interval is illegal: [%d], "
                          "force set to 3600", interval);
         interval = 3600;
     }
@@ -137,25 +137,25 @@ void* OLAPServer::_fd_cache_clean_callback(void* arg) {
     return NULL;
 }
 
-void* OLAPServer::_be_thread_callback(void* arg) {
+void* OLAPServer::_base_compaction_thread_callback(void* arg) {
 #ifdef GOOGLE_PROFILER
     ProfilerRegisterThread();
 #endif
-    uint32_t interval = config::base_expansion_trigger_interval;
+    uint32_t interval = config::base_compaction_check_interval_seconds;
     if (interval <= 0) {
-        OLAP_LOG_WARNING("base expansion triggler interval config is illegal: [%d], "
+        OLAP_LOG_WARNING("base compaction check interval config is illegal: [%d], "
                          "force set to 1", interval);
         interval = 1;
     }
 
-    string last_be_fs;
-    TTabletId last_be_tablet_id = -1;
+    string last_base_compaction_fs;
+    TTabletId last_base_compaction_tablet_id = -1;
     while (true) {
         // must be here, because this thread is start on start and
         // cgroup is not initialized at this time
         // add tid to cgroup
         CgroupsMgr::apply_system_cgroup();
-        OLAPEngine::get_instance()->start_base_expansion(&last_be_fs, &last_be_tablet_id);
+        OLAPEngine::get_instance()->start_base_compaction(&last_base_compaction_fs, &last_base_compaction_tablet_id);
 
         usleep(interval * 1000000);
     }
@@ -253,14 +253,14 @@ void* OLAPServer::_unused_index_thread_callback(void* arg) {
     return NULL;
 }
 
-void* OLAPServer::_cumulative_thread_callback(void* arg) {
+void* OLAPServer::_cumulative_compaction_thread_callback(void* arg) {
 #ifdef GOOGLE_PROFILER
     ProfilerRegisterThread();
 #endif
-    OLAP_LOG_INFO("try to start cumulative process!");
-    uint32_t interval = config::cumulative_check_interval;
+    OLAP_LOG_INFO("try to start cumulative compaction process!");
+    uint32_t interval = config::cumulative_compaction_check_interval_seconds;
     if (interval <= 0) {
-        OLAP_LOG_WARNING("cumulative expansion check interval config is illegal: [%d], "
+        OLAP_LOG_WARNING("cumulative compaction check interval config is illegal: [%d], "
                          "force set to 1", interval);
         interval = 1;
     }
diff --git a/be/src/olap/olap_server.h b/be/src/olap/olap_server.h
index 39ea4febf3..6fbd76cc94 100644
--- a/be/src/olap/olap_server.h
+++ b/be/src/olap/olap_server.h
@@ -35,8 +35,8 @@ public:
 private:
     // Thread functions
 
-    // base expansion thread process function
-    static void* _be_thread_callback(void* arg);
+    // base compaction thread process function
+    static void* _base_compaction_thread_callback(void* arg);
 
     // garbage sweep thread process function. clear snapshot and trash folder
     static void* _garbage_sweeper_thread_callback(void* arg);
@@ -48,7 +48,7 @@ private:
     static void* _unused_index_thread_callback(void* arg);
 
     // cumulative process function
-    static void* _cumulative_thread_callback(void* arg);
+    static void* _cumulative_compaction_thread_callback(void* arg);
 
     // clean file descriptors cache
     static void* _fd_cache_clean_callback(void* arg);
@@ -78,11 +78,11 @@ private:
     static MutexLock _s_session_timeout_mutex;
     static Condition _s_session_timeout_cond;
 
-    // thread to run base expansion
-    std::vector _be_threads;
+    // thread to run base compaction
+    std::vector _base_compaction_threads;
 
     // thread to check cumulative
-    std::vector _cumulative_threads;
+    std::vector _cumulative_compaction_threads;
 
     pthread_t _fd_cache_clean_thread;
 
diff --git a/be/src/olap/olap_snapshot.cpp b/be/src/olap/olap_snapshot.cpp
index 08a6769719..4594e30ef7 100644
--- a/be/src/olap/olap_snapshot.cpp
+++ b/be/src/olap/olap_snapshot.cpp
@@ -181,8 +181,7 @@ void OLAPSnapshot::_update_header_file_info(
                     shortest_versions[i].index_size,
                     shortest_versions[i].data_size,
                     shortest_versions[i].num_rows,
-                    const_cast >*> \
-                    (&shortest_versions[i].column_statistics));
+                    &shortest_versions[i].column_statistics);
         }
     }
 }
diff --git a/be/src/olap/olap_table.cpp b/be/src/olap/olap_table.cpp
index 5aeb484a22..57b6f3af66 100644
--- a/be/src/olap/olap_table.cpp
+++ b/be/src/olap/olap_table.cpp
@@ -455,9 +455,8 @@ OLAPStatus OLAPTable::release_data_sources(vector* data_sources) const {
         return OLAP_ERR_INPUT_PARAMETER_ERROR;
     }
 
-    for (vector::iterator it = data_sources->begin(); it != data_sources->end(); ++it) {
-        delete(*it);
-        *it = NULL;
+    for (auto data : *data_sources) {
+        delete data;
     }
 
     // clear data_sources vector
@@ -685,7 +684,7 @@ OLAPStatus OLAPTable::compute_all_versions_hash(const vector& versions,
 }
 
 OLAPStatus OLAPTable::get_selectivities(vector* selectivities) {
-    // num_rows and selectivities are calculated when loading and base expansioning.
+    // num_rows and selectivities are calculated when loading and base compactioning.
     if (selectivities == NULL) {
         OLAP_LOG_WARNING("parameter num_rows or selectivity is null.");
         return OLAP_ERR_INPUT_PARAMETER_ERROR;
@@ -733,7 +732,7 @@ OLAPStatus OLAPTable::split_range(
         return OLAP_ERR_INPUT_PARAMETER_ERROR;
     }
 
-    Slice entry;
+    EntrySlice entry;
     RowCursor start_key;
     RowCursor end_key;
     RowCursor helper_cursor;
@@ -749,7 +748,7 @@ OLAPStatus OLAPTable::split_range(
 
     // 如果有startkey,用startkeyåˆå§‹åŒ–ï¼›å之则用minkeyåˆå§‹åŒ–
     if (start_key_strings.size() > 0) {
-        if (start_key.init_keys(_tablet_schema, start_key_strings) != OLAP_SUCCESS) {
+        if (start_key.init_scan_key(_tablet_schema, start_key_strings) != OLAP_SUCCESS) {
             OLAP_LOG_WARNING("fail to initial key strings with RowCursor type.");
             return OLAP_ERR_INIT_FAILED;
         }
@@ -764,12 +763,13 @@ OLAPStatus OLAPTable::split_range(
             return OLAP_ERR_INIT_FAILED;
         }
 
+        start_key.allocate_memory_for_string_type(_tablet_schema);
         start_key.build_min_key();
     }
 
     // å’Œstartkey一样处ç†ï¼Œæ²¡æœ‰åˆ™ç”¨maxkeyåˆå§‹åŒ–
     if (end_key_strings.size() > 0) {
-        if (OLAP_SUCCESS != end_key.init_keys(_tablet_schema, end_key_strings)) {
+        if (OLAP_SUCCESS != end_key.init_scan_key(_tablet_schema, end_key_strings)) {
             OLAP_LOG_WARNING("fail to parse strings to key with RowCursor type.");
             return OLAP_ERR_INVALID_SCHEMA;
         }
@@ -784,6 +784,7 @@ OLAPStatus OLAPTable::split_range(
             return OLAP_ERR_INIT_FAILED;
         }
 
+        end_key.allocate_memory_for_string_type(_tablet_schema);
         end_key.build_max_key();
     }
 
@@ -844,8 +845,9 @@ OLAPStatus OLAPTable::split_range(
         return OLAP_ERR_ROWBLOCK_FIND_ROW_EXCEPTION;
     }
 
-    cur_start_key.attach(entry.data, entry.length);
-    last_start_key.copy(cur_start_key);
+    cur_start_key.attach(entry.data);
+    last_start_key.allocate_memory_for_string_type(_tablet_schema);
+    last_start_key.copy_without_pool(cur_start_key);
     // start_key是last start_key, 但返回的实际上是查询层给出的key
     ranges->push_back(start_key.to_string_vector());
 
@@ -862,12 +864,12 @@ OLAPStatus OLAPTable::split_range(
             OLAP_LOG_WARNING("get block entry failed.");
             return OLAP_ERR_ROWBLOCK_FIND_ROW_EXCEPTION;
         }
-        cur_start_key.attach(entry.data, entry.length);
+        cur_start_key.attach(entry.data);
 
         if (cur_start_key.cmp(last_start_key) != 0) {
             ranges->push_back(cur_start_key.to_string_vector()); // end of last section
             ranges->push_back(cur_start_key.to_string_vector()); // start a new section
-            last_start_key.copy(cur_start_key);
+            last_start_key.copy_without_pool(cur_start_key);
         }
     }
 
diff --git a/be/src/olap/olap_table.h b/be/src/olap/olap_table.h
index 7e3bf244b8..cf0974f486 100644
--- a/be/src/olap/olap_table.h
+++ b/be/src/olap/olap_table.h
@@ -42,15 +42,15 @@ class RowBlockPosition;
 // Define OLAPTable's shared_ptr. It is used for
 typedef std::shared_ptr SmartOLAPTable;
 
-enum BaseExpansionStage {
-    BASE_EXPANSION_WAITING = 0,
-    BASE_EXPANSION_RUNNING = 1,
+enum BaseCompactionStage {
+    BASE_COMPACTION_WAITING = 0,
+    BASE_COMPACTION_RUNNING = 1,
 };
 
-struct BaseExpansionStatus {
-    BaseExpansionStatus() : status(BASE_EXPANSION_WAITING), version(-1) {}
+struct BaseCompactionStatus {
+    BaseCompactionStatus() : status(BASE_COMPACTION_WAITING), version(-1) {}
 
-    BaseExpansionStage status;
+    BaseCompactionStage status;
     int32_t version;
 };
 
@@ -222,18 +222,18 @@ public:
         _push_lock.unlock();
     }
 
-    // Prevent base expansion operations execute concurrently.
-    bool try_base_expansion_lock() {
-        return _base_expansion_lock.trylock() == OLAP_SUCCESS;
+    // Prevent base compaction operations execute concurrently.
+    bool try_base_compaction_lock() {
+        return _base_compaction_lock.trylock() == OLAP_SUCCESS;
     }
-    void obtain_base_expansion_lock() {
-        _base_expansion_lock.lock();
+    void obtain_base_compaction_lock() {
+        _base_compaction_lock.lock();
     }
-    void release_base_expansion_lock() {
-        _base_expansion_lock.unlock();
+    void release_base_compaction_lock() {
+        _base_compaction_lock.unlock();
     }
 
-    // Prevent cumulative expansion operations execute concurrently.
+    // Prevent cumulative compaction operations execute concurrently.
     bool try_cumulative_lock() {
         return (OLAP_SUCCESS == _cumulative_lock.trylock());
     }
@@ -379,8 +379,8 @@ public:
     }
 
     // 在使用之å‰å¯¹header加é”
-    const uint32_t get_expansion_nice_estimate() const {
-        return _header->get_expansion_nice_estimate();
+    const uint32_t get_compaction_nice_estimate() const {
+        return _header->get_compaction_nice_estimate();
     }
 
     const OLAPStatus delete_version(const Version& version) {
@@ -492,15 +492,15 @@ public:
     void clear_schema_change_request();
 
     // Following are get/set status functions.
-    // Like base-expansion, push, sync, schema-change.
-    BaseExpansionStatus base_expansion_status() {
-        return _base_expansion_status;
+    // Like base-compaction, push, sync, schema-change.
+    BaseCompactionStatus base_compaction_status() {
+        return _base_compaction_status;
     }
 
-    void set_base_expansion_status(BaseExpansionStage status, int32_t version) {
-        _base_expansion_status.status = status;
+    void set_base_compaction_status(BaseCompactionStage status, int32_t version) {
+        _base_compaction_status.status = status;
         if (version > -2) {
-            _base_expansion_status.version = version;
+            _base_compaction_status.version = version;
         }
     }
 
@@ -662,7 +662,7 @@ private:
     field_index_map_t _field_index_map;
     std::vector _field_sizes;
     // A series of status
-    BaseExpansionStatus _base_expansion_status;
+    BaseCompactionStatus _base_compaction_status;
     PushStatus _push_status;
     SyncStatus _sync_status;
     SchemaChangeStatus _schema_change_status;
@@ -670,7 +670,7 @@ private:
     RWLock _header_lock;
     MutexLock _push_lock;
     MutexLock _cumulative_lock;
-    MutexLock _base_expansion_lock;
+    MutexLock _base_compaction_lock;
     MutexLock _sync_lock;
     size_t _id;                        // uniq id, used in cache
     std::string _storage_root_path;
diff --git a/be/src/olap/push_handler.cpp b/be/src/olap/push_handler.cpp
index d35165cab4..4e2d3db009 100644
--- a/be/src/olap/push_handler.cpp
+++ b/be/src/olap/push_handler.cpp
@@ -406,6 +406,7 @@ OLAPStatus PushHandler::_convert(
             break;
         }
 
+        MemPool* mem_pool = writer->mem_pool();
         // 4. Init RowCursor
         if (OLAP_SUCCESS != (res = row.init(curr_olap_table->tablet_schema()))) {
             OLAP_LOG_WARNING("fail to init rowcursor. [res=%d]", res);
@@ -425,7 +426,7 @@ OLAPStatus PushHandler::_convert(
                     break;
                 }
 
-                res = reader->next(&row);
+                res = reader->next(&row, mem_pool);
                 if (OLAP_SUCCESS != res) {
                     OLAP_LOG_WARNING("read next row failed. [res=%d read_rows=%u]",
                                      res, num_rows);
@@ -457,6 +458,8 @@ OLAPStatus PushHandler::_convert(
                              res, curr_olap_table->full_name().c_str(), _request.version);
             break;
         }
+        _write_bytes += delta_index->data_size();
+        _write_rows += delta_index->num_rows();
 
         // 7. Convert data for schema change tables
         OLAP_LOG_TRACE("load to related tables of schema_change if possible. ");
@@ -769,7 +772,7 @@ OLAPStatus BinaryReader::finalize() {
     return OLAP_SUCCESS;
 }
 
-OLAPStatus BinaryReader::next(RowCursor* row) {
+OLAPStatus BinaryReader::next(RowCursor* row, MemPool* mem_pool) {
     OLAPStatus res = OLAP_SUCCESS;
 
     if (!_ready || NULL == row) {
@@ -789,10 +792,11 @@ OLAPStatus BinaryReader::next(RowCursor* row) {
 
     size_t p  = 0;
     for (size_t i = 0; i < schema.size(); ++i) {
-        if (true == schema[i].is_allow_null) {
+        row->set_not_null(i);
+        if (schema[i].is_allow_null) {
             bool is_null = false;
             is_null = (_row_buf[p/8] >> ((num_null_bytes * 8 - p - 1) % 8)) & 1;
-            if (true == is_null) {
+            if (is_null) {
                 row->set_null(i);
             }
             p++;
@@ -807,19 +811,17 @@ OLAPStatus BinaryReader::next(RowCursor* row) {
         if (schema[i].type == OLAP_FIELD_TYPE_VARCHAR || schema[i].type == OLAP_FIELD_TYPE_HLL) {
             // Read varchar length buffer first
             if (OLAP_SUCCESS != (res = _file->read(_row_buf + offset,
-                        sizeof(VarCharField::LengthValueType)))) {
+                        sizeof(StringLengthType)))) {
                 OLAP_LOG_WARNING("read file for one row fail. [res=%d]", res);
                 return res;
             }
 
-            // Get varchar field size 
-            VarCharField::LengthValueType* size_ptr =
-                reinterpret_cast(_row_buf + offset);
-            offset += sizeof(VarCharField::LengthValueType);
-            field_size = *size_ptr;
-            if (field_size > schema[i].length - sizeof(VarCharField::LengthValueType)) {
+            // Get varchar field size
+            field_size = *reinterpret_cast(_row_buf + offset);
+            offset += sizeof(StringLengthType);
+            if (field_size > schema[i].length - sizeof(StringLengthType)) {
                 OLAP_LOG_WARNING("invalid data length for VARCHAR! [max_len=%d real_len=%d]",
-                                 schema[i].length - sizeof(VarCharField::LengthValueType),
+                                 schema[i].length - sizeof(StringLengthType),
                                  field_size);
                 return OLAP_ERR_PUSH_INPUT_DATA_ERROR;
             }
@@ -833,10 +835,13 @@ OLAPStatus BinaryReader::next(RowCursor* row) {
             return res;
         }
 
-        if (schema[i].type == OLAP_FIELD_TYPE_VARCHAR || schema[i].type == OLAP_FIELD_TYPE_HLL) {
-            row->read_field(_row_buf + offset - sizeof(VarCharField::LengthValueType), i, sizeof(VarCharField::LengthValueType) + field_size);
+        if (schema[i].type == OLAP_FIELD_TYPE_CHAR
+                || schema[i].type == OLAP_FIELD_TYPE_VARCHAR
+                || schema[i].type == OLAP_FIELD_TYPE_HLL) {
+            StringSlice slice(_row_buf + offset, field_size);
+            row->set_field_content(i, reinterpret_cast(&slice), mem_pool);
         } else {
-            row->read_field(_row_buf + offset, i, field_size);
+            row->set_field_content(i, _row_buf + offset, mem_pool);
         }
         offset += field_size;
     }
@@ -899,7 +904,7 @@ OLAPStatus LzoBinaryReader::finalize() {
     return OLAP_SUCCESS;
 }
 
-OLAPStatus LzoBinaryReader::next(RowCursor* row) {
+OLAPStatus LzoBinaryReader::next(RowCursor* row, MemPool* mem_pool) {
     OLAPStatus res = OLAP_SUCCESS;
 
     if (!_ready || NULL == row) {
@@ -920,12 +925,12 @@ OLAPStatus LzoBinaryReader::next(RowCursor* row) {
     size_t num_null_bytes = (_table->num_null_fields() + 7) / 8;
 
     size_t p = 0;
-    row->reset_buf();
     for (size_t i = 0; i < schema.size(); ++i) {
-        if (true == schema[i].is_allow_null) {
+        row->set_not_null(i);
+        if (schema[i].is_allow_null) {
             bool is_null = false;
             is_null = (_row_buf[_next_row_start + p/8] >> ((num_null_bytes * 8 - p - 1) % 8)) & 1;
-            if (true == is_null) {
+            if (is_null) {
                 row->set_null(i);
             }
             p++;
@@ -940,14 +945,12 @@ OLAPStatus LzoBinaryReader::next(RowCursor* row) {
 
         if (schema[i].type == OLAP_FIELD_TYPE_VARCHAR || schema[i].type == OLAP_FIELD_TYPE_HLL) {
             // Get varchar field size
-            VarCharField::LengthValueType* size_ptr =
-                reinterpret_cast(_row_buf + _next_row_start + offset);
-            offset += sizeof(VarCharField::LengthValueType);
-            field_size = *size_ptr;
+            field_size = *reinterpret_cast(_row_buf + _next_row_start + offset);
+            offset += sizeof(StringLengthType);
 
-            if (field_size > schema[i].length - sizeof(VarCharField::LengthValueType)) {
+            if (field_size > schema[i].length - sizeof(StringLengthType)) {
                 OLAP_LOG_WARNING("invalid data length for VARCHAR! [max_len=%d real_len=%d]",
-                                 schema[i].length - sizeof(VarCharField::LengthValueType),
+                                 schema[i].length - sizeof(StringLengthType),
                                  field_size);
                 return OLAP_ERR_PUSH_INPUT_DATA_ERROR;
             }
@@ -955,10 +958,13 @@ OLAPStatus LzoBinaryReader::next(RowCursor* row) {
             field_size = schema[i].length;
         }
 
-        if (schema[i].type == OLAP_FIELD_TYPE_VARCHAR || schema[i].type == OLAP_FIELD_TYPE_HLL) {
-            row->read_field(_row_buf + _next_row_start + offset - sizeof(VarCharField::LengthValueType), i, sizeof(VarCharField::LengthValueType) + field_size);
+        if (schema[i].type == OLAP_FIELD_TYPE_CHAR
+                || schema[i].type == OLAP_FIELD_TYPE_VARCHAR
+                || schema[i].type == OLAP_FIELD_TYPE_HLL) {
+            StringSlice slice(_row_buf + _next_row_start + offset, field_size);
+            row->set_field_content(i, reinterpret_cast(&slice), mem_pool);
         } else {
-            row->read_field(_row_buf + _next_row_start + offset, i, field_size);
+            row->set_field_content(i, _row_buf + _next_row_start + offset, mem_pool);
         }
 
         offset += field_size;
diff --git a/be/src/olap/push_handler.h b/be/src/olap/push_handler.h
index 02b8f0c687..2314394ee8 100644
--- a/be/src/olap/push_handler.h
+++ b/be/src/olap/push_handler.h
@@ -59,7 +59,8 @@ public:
             const TPushReq& request,
             PushType push_type,
             std::vector* tablet_info_vec);
-
+    int64_t write_bytes() const { return _write_bytes; }
+    int64_t write_rows() const { return _write_rows; }
 private:
     // Validate request, mainly data version check.
     OLAPStatus _validate_request(
@@ -156,6 +157,8 @@ private:
     // lock tablet header before modify tabelt header
     bool _header_locked;
 
+    int64_t _write_bytes = 0;
+    int64_t _write_rows = 0;
     DISALLOW_COPY_AND_ASSIGN(PushHandler);
 };
 
@@ -196,7 +199,7 @@ public:
     virtual OLAPStatus init(SmartOLAPTable table, BinaryFile* file) = 0;
     virtual OLAPStatus finalize() = 0;
 
-    virtual OLAPStatus next(RowCursor* row) = 0;
+    virtual OLAPStatus next(RowCursor* row, MemPool* mem_pool) = 0;
 
     virtual bool eof() = 0;
 
@@ -233,7 +236,7 @@ public:
     virtual OLAPStatus init(SmartOLAPTable table, BinaryFile* file);
     virtual OLAPStatus finalize();
 
-    virtual OLAPStatus next(RowCursor* row);
+    virtual OLAPStatus next(RowCursor* row, MemPool* mem_pool);
 
     virtual bool eof() {
         return _curr >= _content_len;
@@ -254,7 +257,7 @@ public:
     virtual OLAPStatus init(SmartOLAPTable table, BinaryFile* file);
     virtual OLAPStatus finalize();
 
-    virtual OLAPStatus next(RowCursor* row);
+    virtual OLAPStatus next(RowCursor* row, MemPool* mem_pool);
 
     virtual bool eof() {
         return _curr >= _content_len && _row_num == 0;
diff --git a/be/src/olap/reader.cpp b/be/src/olap/reader.cpp
index 499328a68e..8bda6227aa 100644
--- a/be/src/olap/reader.cpp
+++ b/be/src/olap/reader.cpp
@@ -19,6 +19,15 @@
 #include "olap/olap_table.h"
 #include "olap/row_block.h"
 #include "olap/row_cursor.h"
+#include "util/date_func.h"
+#include "util/mem_util.hpp"
+#include "runtime/mem_tracker.h"
+#include "runtime/mem_pool.h"
+#include 
+
+#include "olap/comparison_predicate.h"
+#include "olap/in_list_predicate.h"
+#include "olap/null_predicate.h"
 
 using std::nothrow;
 using std::set;
@@ -26,120 +35,277 @@ using std::vector;
 
 namespace palo {
 
-Reader::MergeSet::~MergeSet() {
-    SAFE_DELETE(_heap);
-}
+class CollectIterator {
+public:
+    ~CollectIterator();
 
-OLAPStatus Reader::MergeSet::init(Reader* reader, bool reverse) {
-    _reader = reader;
+    // Hold reader point to get reader params, 
+    // set reverse to true if need read in reverse order.
+    OLAPStatus init(Reader* reader);
 
-    _heap = new (nothrow) heap_t(RowCursorComparator(reverse));
-    if (_heap == NULL) {
-        OLAP_LOG_FATAL("failed to malloc. [size=%ld]", sizeof(heap_t));
-        return OLAP_ERR_MALLOC_ERROR;
+    OLAPStatus add_child(IData* data, RowBlock* block);
+
+    // Get top row of the heap, NULL if reach end.
+    const RowCursor* current_row(bool* delete_flag) const {
+        if (_cur_child != nullptr) {
+            return _cur_child->current_row(delete_flag);
+        }
+        return nullptr;
     }
 
+    // Pop the top element and rebuild the heap to 
+    // get the next row cursor.
+    inline OLAPStatus next(const RowCursor** row, bool* delete_flag);
+
+    // Clear the MergeSet element and reset state.
+    void clear();
+
+private:
+    class ChildCtx {
+    public:
+        ChildCtx(IData* data, RowBlock* block, DeleteHandler* delete_handler)
+                : _data(data),
+                _is_delete(data->delete_flag()),
+                _delete_handler(delete_handler),
+                _row_block(block) {
+        }
+
+        OLAPStatus init() {
+            auto res = _row_cursor.init(_data->olap_index()->table()->tablet_schema());
+            if (res != OLAP_SUCCESS) {
+                LOG(WARNING) << "failed to init row cursor, res=" << res;
+                return res;
+            }
+            res = _refresh_current_row();
+            if (res != OLAP_SUCCESS) {
+                return res;
+            }
+            return OLAP_SUCCESS;
+        }
+
+        const RowCursor* current_row(bool* delete_flag) const {
+            *delete_flag = _is_delete;
+            return _current_row;
+        }
+
+        const RowCursor* current_row() const {
+            return _current_row;
+        }
+
+        int32_t version() const {
+            return _data->version().second;
+        }
+
+        OLAPStatus next(const RowCursor** row, bool* delete_flag) {
+            _row_block->pos_inc();
+            auto res = _refresh_current_row();
+            *row = _current_row;
+            *delete_flag = _is_delete;
+            return res;
+        }
+
+        int64_t num_filtered_rows() const {
+            return _num_filtered_rows;
+        }
+
+    private:
+        // refresh _current_row, 
+        OLAPStatus _refresh_current_row() {
+            DCHECK(_row_block != nullptr);
+            do {
+                if (_row_block->has_remaining()) {
+                    size_t pos = _row_block->pos();
+                    _row_block->get_row(pos, &_row_cursor);
+                    if (_row_block->block_status() == DEL_PARTIAL_SATISFIED &&
+                        _delete_handler->is_filter_data(_data->version().second, _row_cursor)) {
+                        _num_filtered_rows++;
+                        _row_block->pos_inc();
+                        continue;
+                    }
+                    _current_row = &_row_cursor;
+                    return OLAP_SUCCESS;
+                } else {
+                    auto res = _data->get_next_block(&_row_block);
+                    if (res != OLAP_SUCCESS) {
+                        _current_row = nullptr;
+                        return res;
+                    }
+                }
+            } while (_row_block != nullptr);
+            _current_row = nullptr;
+            return OLAP_ERR_DATA_EOF;
+        }
+
+        IData* _data = nullptr;
+        const RowCursor* _current_row = nullptr;
+        bool _is_delete = false;
+        int64_t _num_filtered_rows = 0;
+        DeleteHandler* _delete_handler;
+
+        RowCursor _row_cursor;
+        RowBlock* _row_block = nullptr;
+    };
+
+    // Compare row cursors between multiple merge elements,
+    // if row cursors equal, compare data version.
+    class ChildCtxComparator {
+    public:
+        bool operator()(const ChildCtx* a, const ChildCtx* b);
+    };
+
+    inline OLAPStatus _merge_next(const RowCursor** row, bool* delete_flag);
+    inline OLAPStatus _normal_next(const RowCursor** row, bool* delete_flag);
+
+    // If _merge is true, result row must be ordered
+    bool _merge = true;
+
+    typedef std::priority_queue<ChildCtx*, std::vector<ChildCtx*>, ChildCtxComparator> MergeHeap;
+    MergeHeap _heap;
+
+    std::vector<ChildCtx*> _children;
+    ChildCtx* _cur_child = nullptr;
+    // Used when _merge is false
+    int _child_idx = 0;
+
+    // Hold reader point to access read params, such as fetch conditions.
+    Reader* _reader = nullptr;
+};
+
+
+CollectIterator::~CollectIterator() {
+    for (auto child : _children) {
+        delete child;
+    }
+}
+
+OLAPStatus CollectIterator::init(Reader* reader) {
+    _reader = reader;
+    // when aggregate is enabled or key_type is DUP_KEYS, we don't merge
+    // multiple data to aggregate for performance in user fetch
+    if (_reader->_reader_type == READER_FETCH &&
+            (_reader->_aggregation ||
+             _reader->_olap_table->keys_type() == KeysType::DUP_KEYS)) {
+        _merge = false;
+    }
     return OLAP_SUCCESS;
 }
 
-bool Reader::MergeSet::attach(const MergeElement& merge_element, const RowCursor* row) {
-    // Use data file's end_version as data's version
-    int32_t data_version = merge_element->version().second;
-    do {
-        if (row == NULL) {
-            if (!merge_element->eof()) {
-                // Return error if merge_element isn't reach end, but row equal NULL.
-                OLAP_LOG_WARNING("internal error with IData.");
-                return false;
-            } else {
-                _reader->_filted_rows += merge_element->get_filted_rows();
-            }
-        } else {
-            _reader->_scan_rows++;
-            if (merge_element->data_file_type() == OLAP_DATA_FILE) {
-                if (_reader->_delete_handler.is_filter_data(data_version, *row)) {
-                    _reader->_filted_rows++;
-                    row = merge_element->get_next_row();
-                    continue;
-                }
-            }
-            _heap->push(merge_element);
-        }
-
-        break;
-    } while (true);
-
-    return true;
-}
-
-const RowCursor* Reader::MergeSet::curr(bool* delete_flag) {
-    if (_heap->size() > 0) {
-        *delete_flag = _heap->top()->delete_flag();
-        return _heap->top()->get_current_row();
+OLAPStatus CollectIterator::add_child(IData* data, RowBlock* block) {
+    std::unique_ptr<ChildCtx> child(new ChildCtx(data, block, &_reader->_delete_handler));
+    auto res = child->init();
+    if (res != OLAP_SUCCESS) {
+        LOG(WARNING) << "failed to initial reader, res=" << res;
+        return res;
+    }
+    if (child->current_row() == nullptr) {
+        _reader->_stats.rows_del_filtered += child->num_filtered_rows();
+        return OLAP_SUCCESS;
+    }
+    ChildCtx* child_ptr = child.release();
+    _children.push_back(child_ptr);
+    if (_merge) {
+        _heap.push(child_ptr);
+        _cur_child = _heap.top();
     } else {
-        return NULL;
-    }
-}
-
-bool Reader::MergeSet::next(const RowCursor** element, bool* delete_flag) {
-    if (!_pop_from_heap()) {
-        return false;
-    }
-
-    (element == NULL || (*element = curr(delete_flag)));
-    return true;
-}
-
-bool Reader::MergeSet::_pop_from_heap() {
-    MergeElement merge_element = _heap->top();
-    const RowCursor* row = merge_element->get_next_row();
-
-    // when Reader is used for fetch,
-    // Reader will read deltas one by one without merge sort in DUP_KEYS keys type,
-    // so we don't need to use pop and attach to adjust the _heap.
-    if (_reader->_reader_type == READER_FETCH
-            && _reader->_olap_table->keys_type() == KeysType::DUP_KEYS && row != NULL) {
-        _reader->_scan_rows++;
-        if (merge_element->data_file_type() == OLAP_DATA_FILE) {
-            int32_t data_version = merge_element->version().second;
-            if (_reader->_delete_handler.is_filter_data(data_version, *row)) {
-                _reader->_filted_rows++;
-                return _pop_from_heap();
-            }
-        }
-        return true;
-    }
-
-    _heap->pop();
-    return attach(merge_element, row);
-}
-
-bool Reader::MergeSet::clear() {
-    if (_heap != NULL) {
-        while (_heap->size() > 0) {
-            _heap->pop();
+        if (_cur_child == nullptr) {
+            _cur_child = _children[_child_idx];
         }
     }
-    return true;
+    return OLAP_SUCCESS;
 }
 
-bool Reader::MergeSet::RowCursorComparator::operator()(
-        const MergeElement &a,
-        const MergeElement &b) {
+inline OLAPStatus CollectIterator::next(const RowCursor** row, bool* delete_flag) {
+    DCHECK(_cur_child != nullptr);
+    if (_merge) {
+        return _merge_next(row, delete_flag);
+    } else {
+        return _normal_next(row, delete_flag);
+    }
+}
+
+inline OLAPStatus CollectIterator::_merge_next(const RowCursor** row, bool* delete_flag) {
+    _heap.pop();
+    auto res = _cur_child->next(row, delete_flag);
+    if (res == OLAP_SUCCESS) {
+        _heap.push(_cur_child);
+        _cur_child = _heap.top();
+    } else if (res == OLAP_ERR_DATA_EOF) {
+        _reader->_stats.rows_del_filtered += _cur_child->num_filtered_rows();
+        if (_heap.size() > 0) {
+            _cur_child = _heap.top();
+        } else {
+            _cur_child = nullptr;
+            return OLAP_ERR_DATA_EOF;
+        }
+    } else {
+        LOG(WARNING) << "failed to get next from child, res=" << res;
+        return res;
+    }
+    *row = _cur_child->current_row(delete_flag);
+    return OLAP_SUCCESS;
+}
+
+inline OLAPStatus CollectIterator::_normal_next(const RowCursor** row, bool* delete_flag) {
+    auto res = _cur_child->next(row, delete_flag);
+    if (LIKELY(res == OLAP_SUCCESS)) {
+        return OLAP_SUCCESS;
+    } else if (res == OLAP_ERR_DATA_EOF) {
+        // this child has been read, to read next
+        _reader->_stats.rows_del_filtered += _cur_child->num_filtered_rows();
+        _child_idx++;
+        if (_child_idx < _children.size()) {
+            _cur_child = _children[_child_idx]; 
+            *row = _cur_child->current_row(delete_flag);
+            return OLAP_SUCCESS;
+        } else {
+            _cur_child = nullptr;
+            return OLAP_ERR_DATA_EOF;
+        }
+    } else {
+        LOG(WARNING) << "failed to get next from child, res=" << res;
+        return res;
+    }
+}
+
+bool CollectIterator::ChildCtxComparator::operator()(const ChildCtx* a, const ChildCtx* b) {
     // First compare row cursor.
-    const RowCursor* first = a->get_current_row();
-    const RowCursor* second = b->get_current_row();
+    const RowCursor* first = a->current_row();
+    const RowCursor* second = b->current_row();
     int cmp_res = first->full_key_cmp(*second);
     if (cmp_res != 0) {
-        if (_reverse) {
-            return cmp_res < 0;
-        } else {
-            return cmp_res > 0;
-        }
+        return cmp_res > 0;
     }
-
     // if row cursors equal, compare data version.
-    return a->version().second > b->version().second;
+    return a->version() > b->version();
+}
+
+void CollectIterator::clear() {
+    while (_heap.size() > 0) {
+        _heap.pop();
+    }
+    for (auto child : _children) {
+        delete child;
+    }
+    // _children.swap(std::vector());
+    _children.clear();
+    _cur_child = nullptr;
+    _child_idx = 0;
+}
+
+Reader::Reader()
+        : _next_key_index(0),
+        _aggregation(false),
+        _version_locked(false),
+        _reader_type(READER_FETCH),
+        _next_delete_flag(false),
+        _next_key(NULL),
+        _merged_rows(0) {
+    _tracker.reset(new MemTracker(-1));
+    _predicate_mem_pool.reset(new MemPool(_tracker.get()));
+}
+
+Reader::~Reader() {
+    close();
 }
 
 OLAPStatus Reader::init(const ReaderParams& read_params) {
@@ -157,32 +323,103 @@ OLAPStatus Reader::init(const ReaderParams& read_params) {
         return res;
     }
 
+    for (auto i_data: _data_sources) {
+        i_data->set_stats(&_stats);
+    }
+
     bool eof = false;
     if (OLAP_SUCCESS != (res = _attach_data_to_merge_set(true, &eof))) {
         OLAP_LOG_WARNING("failed to attaching data to merge set. [res=%d]", res);
         return res;
     }
 
-    for (auto i_data: _data_sources) {
-        i_data->set_profile(read_params.profile);
+    switch (_olap_table->keys_type()) {
+    case KeysType::DUP_KEYS:
+        _next_row_func = &Reader::_dup_key_next_row;
+        break;
+    case KeysType::UNIQUE_KEYS:
+        _next_row_func = &Reader::_unique_key_next_row;
+        break;
+    case KeysType::AGG_KEYS:
+        _next_row_func = &Reader::_agg_key_next_row;
+        break;
+    default:
+        break;
     }
+    DCHECK(_next_row_func != nullptr) << "No next row function for type:"
+        << _olap_table->keys_type();
 
-    _is_inited = true;
     return OLAP_SUCCESS;
 }
 
-OLAPStatus Reader::next_row_with_aggregation(
-        RowCursor* row_cursor,
-        int64_t* raw_rows_read,
-        bool* eof) {
-    OLAPStatus res = OLAP_SUCCESS;
-    bool cur_delete_flag = false;
+OLAPStatus Reader::_dup_key_next_row(RowCursor* row_cursor, bool* eof) {
+    *eof = false;
+    if (_next_key == nullptr) {
+        auto res = _attach_data_to_merge_set(false, eof);
+        if (OLAP_SUCCESS != res) {
+            OLAP_LOG_WARNING("failed to attach data to merge set.");
+            return res;
+        }
+        if (*eof) {
+            return OLAP_SUCCESS;
+        }
+    }
+    row_cursor->copy_without_pool(*_next_key);
+    auto res = _collect_iter->next(&_next_key, &_next_delete_flag);
+    if (res != OLAP_SUCCESS) {
+        if (res != OLAP_ERR_DATA_EOF) {
+            return res;
+        }
+    }
+    return OLAP_SUCCESS;
+}
+
+OLAPStatus Reader::_agg_key_next_row(RowCursor* row_cursor, bool* eof) {
     *eof = false;
 
+    if (NULL == _next_key) {
+        auto res = _attach_data_to_merge_set(false, eof);
+        if (OLAP_SUCCESS != res) {
+            OLAP_LOG_WARNING("failed to attach data to merge set.");
+            return res;
+        }
+        if (*eof) {
+            return OLAP_SUCCESS;
+        }
+    }
+    row_cursor->agg_init(*_next_key);
+    int64_t merged_count = 0;
+    do {
+        auto res = _collect_iter->next(&_next_key, &_next_delete_flag);
+        if (res != OLAP_SUCCESS) {
+            if (res != OLAP_ERR_DATA_EOF) {
+                return res;
+            }
+            break;
+        }
+
+        if (_aggregation && merged_count > config::palo_scanner_row_num) {
+            break;
+        }
+        // break while can NOT doing aggregation
+        if (!RowCursor::equal(_key_cids, row_cursor, _next_key)) {
+            break;
+        }
+
+        RowCursor::aggregate(_value_cids, row_cursor, _next_key);
+        ++merged_count;
+    } while (true);
+    _merged_rows += merged_count;
+    row_cursor->finalize_one_merge(_value_cids);
+    return OLAP_SUCCESS;
+}
+
+OLAPStatus Reader::_unique_key_next_row(RowCursor* row_cursor, bool* eof) {
+    *eof = false;
+    bool cur_delete_flag = false;
     do {
         if (NULL == _next_key) {
-            ++_current_key_index;
-            res = _attach_data_to_merge_set(false, eof);
+            auto res = _attach_data_to_merge_set(false, eof);
             if (OLAP_SUCCESS != res) {
                 OLAP_LOG_WARNING("failed to attach data to merge set.");
                 return res;
@@ -193,25 +430,15 @@ OLAPStatus Reader::next_row_with_aggregation(
         }
     
         cur_delete_flag = _next_delete_flag;
-        res = row_cursor->copy(*_next_key);
-    
-        if (OLAP_SUCCESS != res) {
-            OLAP_LOG_WARNING("failed to copy row_cursor. [row_cursor='%s']",
-                    _next_key->to_string().c_str());
-            return res;
-        }
-        ++(*raw_rows_read);
-    
+        row_cursor->agg_init(*_next_key);
+
         int64_t merged_count = 0;
         while (NULL != _next_key) {
-            if (!_merge_set.next(&_next_key, &_next_delete_flag)) {
-                OLAP_LOG_WARNING("internal error with IData.");
-                res = OLAP_ERR_READER_READING_ERROR;
-                break;
-            }
-            
-            if (NULL == _next_key) {
-                row_cursor->finalize_one_merge();
+            auto res = _collect_iter->next(&_next_key, &_next_delete_flag);
+            if (res != OLAP_SUCCESS) {
+                if (res != OLAP_ERR_DATA_EOF) {
+                    return res;
+                }
                 break;
             }
     
@@ -219,72 +446,65 @@ OLAPStatus Reader::next_row_with_aggregation(
             //   1. DUP_KEYS keys type has no semantic to aggregate,
             //   2. to make cost of  each scan round reasonable, we will control merged_count.
             if (_olap_table->keys_type() == KeysType::DUP_KEYS
-                    || (_aggregation && merged_count > config::palo_scanner_row_num)) {
-               row_cursor->finalize_one_merge(); 
-               break;
-            }
-    
-            // break while can NOT doing aggregation
-            if (!row_cursor->equal(*_next_key)) {
-               row_cursor->finalize_one_merge(); 
-               break;
-            }
-    
-            cur_delete_flag = _next_delete_flag;
-            res = row_cursor->aggregate(*_next_key);
-            if (OLAP_SUCCESS != res) {
-                OLAP_LOG_WARNING("failed to aggregate row cursor.[base_row='%s', new_rows='%s']",
-                        row_cursor->to_string().c_str(), _next_key->to_string().c_str());
+                || (_aggregation && merged_count > config::palo_scanner_row_num)) {
+                row_cursor->finalize_one_merge(_value_cids);
                 break;
             }
-    
+            // break while can NOT doing aggregation
+            if (!RowCursor::equal(_key_cids, row_cursor, _next_key)) {
+                row_cursor->finalize_one_merge(_value_cids);
+                break;
+            }
+
+            cur_delete_flag = _next_delete_flag;
+            RowCursor::aggregate(_value_cids, row_cursor, _next_key);
             ++merged_count;
         }
     
-        if (res == OLAP_SUCCESS) {
-            _merged_rows += merged_count;
-            *raw_rows_read += merged_count;
+        _merged_rows += merged_count;
+    
+        if (!cur_delete_flag) {
+            return OLAP_SUCCESS;
         }
     
-        if (res != OLAP_SUCCESS || !cur_delete_flag) {
-            return res;
-        }
-    
-        ++_filted_rows;
+        _stats.rows_del_filtered++;
     } while (cur_delete_flag);
 
-    return res;
+    return OLAP_SUCCESS;
 }
 
 void Reader::close() {
-    OLAP_LOG_DEBUG("scan rows:%lu, filted rows:%lu, merged rows:%lu",
-                   _scan_rows, _filted_rows, _merged_rows);
+    OLAP_LOG_DEBUG("merged rows:%lu", _merged_rows);
     _conditions.finalize();
     _delete_handler.finalize();
-    if (!_is_set_data_sources) {
-        _olap_table->release_data_sources(&_data_sources);
+    _olap_table->release_data_sources(&_own_data_sources);
+
+    for (auto pred : _col_predicates) {
+        delete pred;
     }
+
+    delete _collect_iter;
 }
 
 OLAPStatus Reader::_acquire_data_sources(const ReaderParams& read_params) {
-    _data_sources.clear();
+    const std::vector<IData*>* data_sources;
     if (read_params.reader_type == READER_ALTER_TABLE
-            || read_params.reader_type == READER_BASE_EXPANSION
-            || read_params.reader_type == READER_CUMULATIVE_EXPANSION) {
-        _data_sources = read_params.olap_data_arr;
-        _is_set_data_sources = true;
+            || read_params.reader_type == READER_BASE_COMPACTION
+            || read_params.reader_type == READER_CUMULATIVE_COMPACTION) {
+        data_sources = &read_params.olap_data_arr;
     } else {
         _olap_table->obtain_header_rdlock();
-        _olap_table->acquire_data_sources(_version, &_data_sources);
+        _olap_table->acquire_data_sources(_version, &_own_data_sources);
         _olap_table->release_header_lock();
 
-        if (_data_sources.size() < 1) {
+        if (_own_data_sources.size() < 1) {
             OLAP_LOG_WARNING("fail to acquire data sources. [table_name='%s' version=%d-%d]",
                              _olap_table->full_name().c_str(),
                              _version.first,
                              _version.second);
             return OLAP_ERR_VERSION_NOT_EXIST;
         }
+        data_sources = &_own_data_sources;
     }
     
     // do not use index stream cache when be/ce/alter/checksum,
@@ -294,16 +514,42 @@ OLAPStatus Reader::_acquire_data_sources(const ReaderParams& read_params) {
         is_using_cache = false;
     }
 
-    for (auto i_data: _data_sources) {
-        i_data->set_conjuncts(_query_conjunct_ctxs, NULL);
+    for (auto i_data: *data_sources) {
+        // skip empty version
+        if (i_data->empty()) {
+            continue;
+        }
         i_data->set_delete_handler(_delete_handler);
         i_data->set_read_params(_return_columns,
                                 _load_bf_columns,
                                 _conditions,
+                                _col_predicates,
                                 _keys_param.start_keys,
                                 _keys_param.end_keys,
                                 is_using_cache,
                                 read_params.runtime_state);
+        if (i_data->delta_pruning_filter()) {
+            OLAP_LOG_DEBUG("filter delta in query in condition: %d, %d",
+                           i_data->version().first, i_data->version().second);
+            _stats.rows_stats_filtered += i_data->num_rows();
+            continue;
+        }
+        int ret = i_data->delete_pruning_filter();
+        if (ret == DEL_SATISFIED) {
+            OLAP_LOG_DEBUG("filter delta in query: %d, %d",
+                           i_data->version().first, i_data->version().second);
+            _stats.rows_del_filtered += i_data->num_rows();
+            continue;
+        } else if (ret == DEL_PARTIAL_SATISFIED) {
+            OLAP_LOG_DEBUG("filter delta partially in query: %d, %d",
+                           i_data->version().first, i_data->version().second);
+            i_data->set_delete_status(DEL_PARTIAL_SATISFIED);
+        } else {
+            OLAP_LOG_DEBUG("not filter delta in query: %d, %d",
+                           i_data->version().first, i_data->version().second);
+            i_data->set_delete_status(DEL_NOT_SATISFIED);
+        }
+        _data_sources.push_back(i_data);
     }
 
     return OLAP_SUCCESS;
@@ -346,11 +592,8 @@ OLAPStatus Reader::_init_params(const ReaderParams& read_params) {
         return res;
     }
 
-    res = _merge_set.init(this, false);
-    if (res != OLAP_SUCCESS) {
-        OLAP_LOG_WARNING("fail to init merge set. [res=%d]", res);
-        return res;
-    }
+    _collect_iter = new CollectIterator();
+    _collect_iter->init(this);
 
     return res;
 }
@@ -370,19 +613,40 @@ OLAPStatus Reader::_init_return_columns(const ReaderParams& read_params) {
                 }
             }
         }
+        for (auto id : read_params.return_columns) {
+            if (_olap_table->tablet_schema()[id].is_key) {
+                _key_cids.push_back(id);
+            } else {
+                _value_cids.push_back(id);
+            }
+        }
     } else if (read_params.return_columns.size() == 0) {
         for (size_t i = 0; i < _olap_table->tablet_schema().size(); ++i) {
             _return_columns.push_back(i);
+            if (_olap_table->tablet_schema()[i].is_key) {
+                _key_cids.push_back(i);
+            } else {
+                _value_cids.push_back(i);
+            }
         }
         OLAP_LOG_DEBUG("return column is empty, using full column as defaut.");
     } else if (read_params.reader_type == READER_CHECKSUM) {
-        // do nothing
+        _return_columns = read_params.return_columns;
+        for (auto id : read_params.return_columns) {
+            if (_olap_table->tablet_schema()[id].is_key) {
+                _key_cids.push_back(id);
+            } else {
+                _value_cids.push_back(id);
+            }
+        }
     } else {
         OLAP_LOG_WARNING("fail to init return columns. [reader_type=%d return_columns_size=%u]",
                          read_params.reader_type, read_params.return_columns.size());
         return OLAP_ERR_INPUT_PARAMETER_ERROR;
     }
 
+    std::sort(_key_cids.begin(), _key_cids.end(), std::greater<uint32_t>());
+
     return OLAP_SUCCESS;
 }
 
@@ -395,19 +659,20 @@ OLAPStatus Reader::_attach_data_to_merge_set(bool first, bool *eof) {
         RowCursor *end_key = NULL;
         bool find_last_row = false;
         bool end_key_find_last_row = false;
-
-        _merge_set.clear();
+        _collect_iter->clear();
 
         if (_keys_param.start_keys.size() > 0) {
-            if (_current_key_index >= _keys_param.start_keys.size()) {
+            if (_next_key_index >= _keys_param.start_keys.size()) {
                 *eof = true;
                 OLAP_LOG_DEBUG("can NOT attach while start_key has been used.");
                 return res;
             }
-            start_key = _keys_param.start_keys[_current_key_index];
+            auto cur_key_index = _next_key_index++;
+            
+            start_key = _keys_param.start_keys[cur_key_index];
 
             if (0 != _keys_param.end_keys.size()) {
-                end_key = _keys_param.end_keys[_current_key_index];
+                end_key = _keys_param.end_keys[cur_key_index];
                 if (0 == _keys_param.end_range.compare("lt")) {
                     end_key_find_last_row = false;
                 } else if (0 == _keys_param.end_range.compare("le")) {
@@ -460,84 +725,29 @@ OLAPStatus Reader::_attach_data_to_merge_set(bool first, bool *eof) {
             return res;
         }
 
-        for (std::vector<IData*>::iterator it = _data_sources.begin();
-                it != _data_sources.end(); ++it) {
-            const RowCursor *start_row_cursor = NULL;
-
-            if (OLAP_LIKELY(start_key != NULL)) {
-                if ((*it)->delta_pruning_filter()) {
-                    OLAP_LOG_DEBUG("filter delta in query in condition: %d, %d",
-                                   (*it)->version().first, (*it)->version().second);
-                    _filted_rows += (*it)->num_rows();
-                    continue;
+        for (auto data : _data_sources) {
+            RowBlock* block = nullptr;
+            auto res = data->prepare_block_read(
+                start_key, find_last_row, end_key, end_key_find_last_row, &block);
+            if (res == OLAP_SUCCESS) {
+                res = _collect_iter->add_child(data, block);
+                if (res != OLAP_SUCCESS && res != OLAP_ERR_DATA_EOF) {
+                    LOG(WARNING) << "failed to add child to iterator";
+                    return res;
                 }
-
-                int ret = (*it)->delete_pruning_filter();
-                if (DEL_SATISFIED == ret) {
-                    OLAP_LOG_DEBUG("filter delta in query: %d, %d",
-                                   (*it)->version().first, (*it)->version().second);
-                    _filted_rows += (*it)->num_rows();
-                    continue;
-                } else if (DEL_PARTIAL_SATISFIED == ret) {
-                    OLAP_LOG_DEBUG("filter delta partially in query: %d, %d",
-                                   (*it)->version().first, (*it)->version().second);
-                    (*it)->set_delete_status(DEL_PARTIAL_SATISFIED);
-                } else {
-                    OLAP_LOG_DEBUG("not filter delta in query: %d, %d",
-                                   (*it)->version().first, (*it)->version().second);
-                    (*it)->set_delete_status(DEL_NOT_SATISFIED);
-                }
-
-                (*it)->set_end_key(end_key, end_key_find_last_row);
-                start_row_cursor = (*it)->find_row(*start_key, find_last_row, false);
-            } else {
-                if ((*it)->empty()) {
-                    continue;
-                }
-
-                //BE procedure will go into this branch, which key params is empty
-                int ret = (*it)->delete_pruning_filter();
-                if (DEL_SATISFIED == ret) {
-                    OLAP_LOG_DEBUG("filter delta in query: %d, %d",
-                                   (*it)->version().first, (*it)->version().second);
-                    _filted_rows += (*it)->num_rows();
-                    continue;
-                } else if (DEL_PARTIAL_SATISFIED == ret) {
-                    OLAP_LOG_DEBUG("filter delta partially in query: %d, %d",
-                                   (*it)->version().first, (*it)->version().second);
-                    (*it)->set_delete_status(DEL_PARTIAL_SATISFIED);
-                } else {
-                    OLAP_LOG_DEBUG("not filter delta in query: %d, %d",
-                                   (*it)->version().first, (*it)->version().second);
-                    (*it)->set_delete_status(DEL_NOT_SATISFIED);
-                }
-
-                start_row_cursor = (*it)->get_first_row();
-            }
-
-            if ((*it)->eof()) {
-                OLAP_LOG_DEBUG("got EOF while setting start_row_cursor. "
-                               "[version=%d-%d read_params='%s']",
-                               (*it)->version().first, (*it)->version().second,
-                               _keys_param.to_string().c_str());
+            } else if (res == OLAP_ERR_DATA_EOF) {
                 continue;
+            } else {
+                LOG(WARNING) << "prepare block failed, res=" << res;
+                return res;
             }
-
-            if (!start_row_cursor) {
-                OLAP_LOG_WARNING("failed to set start_row_cursor. [read_params='%s']",
-                        _keys_param.to_string().c_str());
-                return OLAP_ERR_READER_GET_ITERATOR_ERROR;
-            }
-
-            _merge_set.attach(*it, start_row_cursor);
         }
 
-        _next_key = _merge_set.curr(&_next_delete_flag);
+        _next_key = _collect_iter->current_row(&_next_delete_flag);
         if (_next_key != NULL) {
             break;
         }
 
-        ++_current_key_index;
         first = false;
     } while (NULL == _next_key);
 
@@ -547,7 +757,7 @@ OLAPStatus Reader::_attach_data_to_merge_set(bool first, bool *eof) {
 OLAPStatus Reader::_init_keys_param(const ReaderParams& read_params) {
     OLAPStatus res = OLAP_SUCCESS;
 
-    _current_key_index = 0;
+    _next_key_index = 0;
 
     if (read_params.start_key.size() == 0) {
         return OLAP_SUCCESS;
@@ -564,7 +774,7 @@ OLAPStatus Reader::_init_keys_param(const ReaderParams& read_params) {
             return OLAP_ERR_MALLOC_ERROR;
         }
         
-        res = _keys_param.start_keys[i]->init_keys(_olap_table->tablet_schema(),
+        res = _keys_param.start_keys[i]->init_scan_key(_olap_table->tablet_schema(),
                                               read_params.start_key[i].key);
         if (res != OLAP_SUCCESS) {
             OLAP_LOG_WARNING("fail to init row cursor. [res=%d]", res);
@@ -592,13 +802,8 @@ OLAPStatus Reader::_init_keys_param(const ReaderParams& read_params) {
             return OLAP_ERR_MALLOC_ERROR;
         }
         
-        res = _keys_param.end_keys[i]->init_keys(_olap_table->tablet_schema(),
+        res = _keys_param.end_keys[i]->init_scan_key(_olap_table->tablet_schema(),
                                             read_params.end_key[i].key);
-        /*
-        for (size_t j = 0; j < read_params.end_key[i].key.size(); ++j) {
-            _keys_param.end_keys[i]->set_null(j);
-        }
-        */
         if (res != OLAP_SUCCESS) {
             OLAP_LOG_WARNING("fail to init row cursor. [res=%d]", res);
             return res;
@@ -622,20 +827,257 @@ OLAPStatus Reader::_init_conditions_param(const ReaderParams& read_params) {
     _conditions.set_table(_olap_table);
     for (int i = 0; i < read_params.conditions.size(); ++i) {
         _conditions.append_condition(read_params.conditions[i]);
+        ColumnPredicate* predicate = _parse_to_predicate(read_params.conditions[i]);
+        if (predicate != NULL) {
+            _col_predicates.push_back(predicate);
+        }
     }
-    _query_conjunct_ctxs = read_params.conjunct_ctxs;
 
     return res;
 }
 
+#define COMPARISON_PREDICATE_CONDITION_VALUE(NAME, PREDICATE) \
+ColumnPredicate* Reader::_new_##NAME##_pred(FieldInfo& fi, int index, const std::string& cond) { \
+    ColumnPredicate* predicate = NULL; \
+    switch (fi.type) { \
+        case OLAP_FIELD_TYPE_TINYINT: { \
+            std::stringstream ss(cond); \
+            int32_t value = 0; \
+            ss >> value; \
+            predicate = new PREDICATE(index, value); \
+            break; \
+        } \
+        case OLAP_FIELD_TYPE_SMALLINT: { \
+            std::stringstream ss(cond); \
+            int16_t value = 0; \
+            ss >> value; \
+            predicate = new PREDICATE(index, value); \
+            break; \
+        } \
+        case OLAP_FIELD_TYPE_INT: { \
+            std::stringstream ss(cond); \
+            int32_t value = 0; \
+            ss >> value; \
+            predicate = new PREDICATE(index, value); \
+            break; \
+        } \
+        case OLAP_FIELD_TYPE_BIGINT: { \
+            std::stringstream ss(cond); \
+            int64_t value = 0; \
+            ss >> value; \
+            predicate = new PREDICATE(index, value); \
+            break; \
+        } \
+        case OLAP_FIELD_TYPE_LARGEINT: { \
+            std::stringstream ss(cond); \
+            int128_t value = 0; \
+            ss >> value; \
+            predicate = new PREDICATE(index, value); \
+            break; \
+        } \
+        case OLAP_FIELD_TYPE_DECIMAL: { \
+            decimal12_t value(0, 0); \
+            value.from_string(cond); \
+            predicate = new PREDICATE(index, value); \
+            break; \
+        } \
+        case OLAP_FIELD_TYPE_CHAR: {\
+            StringValue value; \
+            size_t length = std::max(static_cast(fi.length), cond.length());\
+            char* buffer = reinterpret_cast(_predicate_mem_pool->allocate(length)); \
+            memset(buffer, 0, length); \
+            memory_copy(buffer, cond.c_str(), cond.length()); \
+            value.len = length; \
+            value.ptr = buffer; \
+            predicate = new PREDICATE(index, value); \
+            break; \
+        } \
+        case OLAP_FIELD_TYPE_VARCHAR: { \
+            StringValue value; \
+            int32_t length = cond.length(); \
+            char* buffer = reinterpret_cast(_predicate_mem_pool->allocate(length)); \
+            memory_copy(buffer, cond.c_str(), length); \
+            value.len = length; \
+            value.ptr = buffer; \
+            predicate = new PREDICATE(index, value); \
+            break; \
+        } \
+        case OLAP_FIELD_TYPE_DATE: { \
+            uint24_t value = timestamp_from_date(cond); \
+            predicate = new PREDICATE(index, value); \
+            break; \
+        } \
+        case OLAP_FIELD_TYPE_DATETIME: { \
+            uint64_t value = timestamp_from_datetime(cond); \
+            predicate = new PREDICATE(index, value); \
+            break; \
+        } \
+        default: break; \
+    } \
+ \
+    return predicate; \
+} \
+
+COMPARISON_PREDICATE_CONDITION_VALUE(eq, EqualPredicate)
+COMPARISON_PREDICATE_CONDITION_VALUE(ne, NotEqualPredicate)
+COMPARISON_PREDICATE_CONDITION_VALUE(lt, LessPredicate)
+COMPARISON_PREDICATE_CONDITION_VALUE(le, LessEqualPredicate)
+COMPARISON_PREDICATE_CONDITION_VALUE(gt, GreaterPredicate)
+COMPARISON_PREDICATE_CONDITION_VALUE(ge, GreaterEqualPredicate)
+
+ColumnPredicate* Reader::_parse_to_predicate(const TCondition& condition) {
+    // TODO: not equal and not in predicate is not pushed down
+    int index = _olap_table->get_field_index(condition.column_name);
+    FieldInfo fi = _olap_table->tablet_schema()[index];
+    ColumnPredicate* predicate = NULL;
+    if (condition.condition_op == "*="
+            && condition.condition_values.size() == 1) {
+        predicate = _new_eq_pred(fi, index, condition.condition_values[0]);
+    } else if (condition.condition_op == "<<") {
+        predicate = _new_lt_pred(fi, index, condition.condition_values[0]);
+    } else if (condition.condition_op == "<=") {
+        predicate = _new_le_pred(fi, index, condition.condition_values[0]);
+    } else if (condition.condition_op == ">>") {
+        predicate = _new_gt_pred(fi, index, condition.condition_values[0]);
+    } else if (condition.condition_op == ">=") {
+        predicate = _new_ge_pred(fi, index, condition.condition_values[0]);
+    } else if (condition.condition_op == "*="
+            && condition.condition_values.size() > 1) {
+        switch (fi.type) {
+            case OLAP_FIELD_TYPE_TINYINT: {
+                std::set values;
+                for (auto& cond_val : condition.condition_values) {
+                    int32_t value = 0;
+                    std::stringstream ss(cond_val);
+                    ss >> value;
+                    values.insert(value);
+                }
+                predicate = new InListPredicate(index, std::move(values));
+                break;
+            }
+            case OLAP_FIELD_TYPE_SMALLINT: {
+                std::set values;
+                for (auto& cond_val : condition.condition_values) {
+                    int16_t value = 0;
+                    std::stringstream ss(cond_val);
+                    ss >> value;
+                    values.insert(value);
+                }
+                predicate = new InListPredicate(index, std::move(values));
+                break;
+            }
+            case OLAP_FIELD_TYPE_INT: {
+                std::set values;
+                for (auto& cond_val : condition.condition_values) {
+                    int32_t value = 0;
+                    std::stringstream ss(cond_val);
+                    ss >> value;
+                    values.insert(value);
+                }
+                predicate = new InListPredicate(index, std::move(values));
+                break;
+            }
+            case OLAP_FIELD_TYPE_BIGINT: {
+                std::set values;
+                for (auto& cond_val : condition.condition_values) {
+                    int64_t value = 0;
+                    std::stringstream ss(cond_val);
+                    ss >> value;
+                    values.insert(value);
+                }
+                predicate = new InListPredicate(index, std::move(values));
+                break;
+            }
+            case OLAP_FIELD_TYPE_LARGEINT: {
+                std::set values;
+                for (auto& cond_val : condition.condition_values) {
+                    int128_t value = 0;
+                    std::stringstream ss(cond_val);
+                    ss >> value;
+                    values.insert(value);
+                }
+                predicate = new InListPredicate(index, std::move(values));
+                break;
+            }
+            case OLAP_FIELD_TYPE_DECIMAL: {
+                std::set values;
+                for (auto& cond_val : condition.condition_values) {
+                    decimal12_t value;
+                    value.from_string(cond_val);
+                    values.insert(value);
+                }
+                predicate = new InListPredicate(index, std::move(values));
+                break;
+            }
+            case OLAP_FIELD_TYPE_CHAR: {
+                std::set values;
+                for (auto& cond_val : condition.condition_values) {
+                    StringValue value;
+                    size_t length = std::max(static_cast(fi.length), cond_val.length());
+                    char* buffer = reinterpret_cast(_predicate_mem_pool->allocate(length));
+                    memset(buffer, 0, length);
+                    memory_copy(buffer, cond_val.c_str(), cond_val.length());
+                    value.len = length;
+                    value.ptr = buffer;
+                    values.insert(value);
+                }
+                predicate = new InListPredicate(index, std::move(values));
+                break;
+            }
+            case OLAP_FIELD_TYPE_VARCHAR: {
+                std::set values;
+                for (auto& cond_val : condition.condition_values) {
+                    StringValue value;
+                    int32_t length = cond_val.length();
+                    char* buffer = reinterpret_cast(_predicate_mem_pool->allocate(length));
+                    memory_copy(buffer, cond_val.c_str(), length);
+                    value.len = length;
+                    value.ptr = buffer;
+                    values.insert(value);
+                }
+                predicate = new InListPredicate(index, std::move(values));
+                break;
+            }
+            case OLAP_FIELD_TYPE_DATE: {
+                std::set values;
+                for (auto& cond_val : condition.condition_values) {
+                    uint24_t value = timestamp_from_date(cond_val);
+                    values.insert(value);
+                }
+                predicate = new InListPredicate(index, std::move(values));
+                break;
+            }
+            case OLAP_FIELD_TYPE_DATETIME: {
+                std::set values;
+                for (auto& cond_val : condition.condition_values) {
+                    uint64_t value = timestamp_from_datetime(cond_val);
+                    values.insert(value);
+                }
+                predicate = new InListPredicate(index, std::move(values));
+                break;
+            }
+            default: break;
+        }
+    } else if (condition.condition_op == "is") {
+        bool is_null = false;
+        if (condition.condition_values[0] == "null") {
+            is_null = true;
+        } else {
+            is_null = false;
+        }
+        predicate = new NullPredicate(index, is_null);
+    }
+    return predicate;
+}
+
 OLAPStatus Reader::_init_load_bf_columns(const ReaderParams& read_params) {
     OLAPStatus res = OLAP_SUCCESS;
 
     // add all columns with condition to _load_bf_columns
     for (const auto& cond_column : _conditions.columns()) {
-        for (const Cond& cond : cond_column.second.conds()) {
-            if (cond.op == OP_EQ
-                    || (cond.op == OP_IN && cond.operand_set.size() < MAX_OP_IN_FIELD_NUM)) {
+        for (const Cond* cond : cond_column.second->conds()) {
+            if (cond->op == OP_EQ
+                    || (cond->op == OP_IN && cond->operand_set.size() < MAX_OP_IN_FIELD_NUM)) {
                 _load_bf_columns.insert(cond_column.first);
             }
         }
@@ -692,7 +1134,7 @@ OLAPStatus Reader::_init_load_bf_columns(const ReaderParams& read_params) {
 }
 
 OLAPStatus Reader::_init_delete_condition(const ReaderParams& read_params) {
-    if (read_params.reader_type != READER_CUMULATIVE_EXPANSION) {
+    if (read_params.reader_type != READER_CUMULATIVE_COMPACTION) {
         _olap_table->obtain_header_rdlock();
         OLAPStatus ret = _delete_handler.init(_olap_table, read_params.version.second);
         _olap_table->release_header_lock();
diff --git a/be/src/olap/reader.h b/be/src/olap/reader.h
index 1f184cefc9..a8f1e96c77 100644
--- a/be/src/olap/reader.h
+++ b/be/src/olap/reader.h
@@ -27,18 +27,21 @@
 #include 
 #include 
 
-#include "exprs/expr.h"
 #include "olap/delete_handler.h"
 #include "olap/olap_cond.h"
 #include "olap/olap_define.h"
 #include "olap/row_cursor.h"
 #include "util/runtime_profile.h"
 
+#include "olap/column_predicate.h"
+
 namespace palo {
 
 class OLAPTable;
 class RowCursor;
 class RowBlock;
+class CollectIterator;
+class RuntimeState;
 
 // Params for Reader,
 // mainly include tablet, data version and fetch range.
@@ -52,7 +55,6 @@ struct ReaderParams {
     std::vector start_key;
     std::vector end_key;
     std::vector conditions;
-    std::vector* conjunct_ctxs;
     // The IData will be set when using Merger, eg Cumulative, BE.
     std::vector olap_data_arr;
     std::vector return_columns;
@@ -62,7 +64,6 @@ struct ReaderParams {
     ReaderParams() :
             reader_type(READER_FETCH),
             aggregation(true),
-            conjunct_ctxs(NULL),
             profile(NULL),
             runtime_state(NULL) {
         start_key.clear();
@@ -99,22 +100,8 @@ struct ReaderParams {
 
 class Reader {
 public:
-    Reader() :
-            _is_inited(false),
-            _aggregation(false),
-            _version_locked(false),
-            _reader_type(READER_FETCH),
-            _is_set_data_sources(false),
-            _current_key_index(0),
-            _next_key(NULL),
-            _next_delete_flag(false),
-            _scan_rows(0),
-            _filted_rows(0),
-            _merged_rows(0) {}
-
-    ~Reader() {
-        close();
-    }
+    Reader();
+    ~Reader();
 
     // Initialize Reader with tablet, data version and fetch range.
     OLAPStatus init(const ReaderParams& read_params);
@@ -122,16 +109,20 @@ public:
     void close();
 
     // Reader next row with aggregation.
-    OLAPStatus next_row_with_aggregation(RowCursor *row_cursor, int64_t* raw_rows_read, bool *eof);
+    OLAPStatus next_row_with_aggregation(RowCursor *row_cursor, bool *eof) {
+        return (this->*_next_row_func)(row_cursor, eof);
+    }
 
     uint64_t merged_rows() const {
         return _merged_rows;
     }
 
     uint64_t filted_rows() const {
-        return _filted_rows;
+        return _stats.rows_del_filtered;
     }
 
+    const OlapReaderStatistics& stats() const { return _stats; }
+
 private:
     struct KeysParam {
         ~KeysParam() {
@@ -166,57 +157,7 @@ private:
         std::vector end_keys;
     };
 
-    typedef IData* MergeElement;
-
-    // Use priority_queue as heap to merge multiple data versions.
-    class MergeSet {
-    public:
-        MergeSet() : _heap(NULL) {}
-        ~MergeSet();
-
-        // Hold reader point to get reader params, 
-        // set reverse to true if need read in reverse order.
-        OLAPStatus init(Reader* reader, bool reverse);
-
-        // Add merge element into heap.
-        bool attach(const MergeElement& merge_element, const RowCursor* row);
-
-        // Get top row of the heap, NULL if reach end.
-        const RowCursor* curr(bool* delete_flag);
-
-        // Pop the top element and rebuild the heap to 
-        // get the next row cursor.
-        bool next(const RowCursor** element, bool* delete_flag);
-
-        // Clear the MergeSet element and reset state.
-        bool clear();
-
-    private:
-        // Compare row cursors between multiple merge elements,
-        // if row cursors equal, compare data version.
-        class RowCursorComparator {
-        public:
-            explicit RowCursorComparator(bool reverse) : _reverse(reverse) {}
-            bool operator()(const MergeElement& a, const MergeElement& b);
-
-        private:
-            bool _reverse;
-        };
-
-        typedef std::priority_queue, RowCursorComparator>
-            heap_t;
-        
-        bool _pop_from_heap();
-
-        heap_t* _heap;
-
-        uint64_t _merge_count;
-
-        // Hold reader point to access read params, such as fetch conditions.
-        Reader* _reader;
-    }; 
-
-    friend class MergeSet;
+    friend class CollectIterator;
 
     OLAPStatus _init_params(const ReaderParams& read_params);
 
@@ -226,6 +167,15 @@ private:
 
     OLAPStatus _init_conditions_param(const ReaderParams& read_params);
 
+    ColumnPredicate* _new_eq_pred(FieldInfo& type, int index, const std::string& cond);
+    ColumnPredicate* _new_ne_pred(FieldInfo& type, int index, const std::string& cond);
+    ColumnPredicate* _new_lt_pred(FieldInfo& type, int index, const std::string& cond);
+    ColumnPredicate* _new_le_pred(FieldInfo& type, int index, const std::string& cond);
+    ColumnPredicate* _new_gt_pred(FieldInfo& type, int index, const std::string& cond);
+    ColumnPredicate* _new_ge_pred(FieldInfo& type, int index, const std::string& cond);
+
+    ColumnPredicate* _parse_to_predicate(const TCondition& condition);
+
     OLAPStatus _init_delete_condition(const ReaderParams& read_params);
 
     OLAPStatus _init_return_columns(const ReaderParams& read_params);
@@ -233,45 +183,48 @@ private:
     OLAPStatus _init_load_bf_columns(const ReaderParams& read_params);
 
     OLAPStatus _attach_data_to_merge_set(bool first, bool *eof);
+    
+    OLAPStatus _dup_key_next_row(RowCursor* row_cursor, bool* eof);
+    OLAPStatus _agg_key_next_row(RowCursor* row_cursor, bool* eof);
+    OLAPStatus _unique_key_next_row(RowCursor* row_cursor, bool* eof);
 
-    bool _is_inited;
-    bool _aggregation;
-    bool _version_locked;
-    ReaderType _reader_type;
+private:
+    std::unique_ptr _tracker;
+    std::unique_ptr _predicate_mem_pool;
+    std::set _load_bf_columns;
+    std::vector _return_columns;
 
     Version _version;
 
     SmartOLAPTable _olap_table;
 
+    // _own_data_sources is data source that reader aquire from olap_table, so we need to
+    // release these when reader closing
+    std::vector _own_data_sources;
     std::vector _data_sources;
 
-    // If ReaderParams.olap_data_arr is set out of reader,
-    // will not acquire data sources according to version.
-    bool _is_set_data_sources;
-
     KeysParam _keys_param;
-
-    int32_t _current_key_index;
+    int32_t _next_key_index;
 
     Conditions _conditions;
-
-    std::vector* _query_conjunct_ctxs;
+    std::vector _col_predicates;
 
     DeleteHandler _delete_handler;
 
-    MergeSet _merge_set;
+    OLAPStatus (Reader::*_next_row_func)(RowCursor* row_cursor, bool* eof) = nullptr;
 
-    const RowCursor* _next_key;
+    bool _aggregation;
+    bool _version_locked;
+    ReaderType _reader_type;
     bool _next_delete_flag;
+    const RowCursor* _next_key;
+    CollectIterator* _collect_iter = nullptr;
+    std::vector _key_cids;
+    std::vector _value_cids;
 
-    std::set _load_bf_columns;
-    std::vector _return_columns;
-
-    uint64_t _scan_rows;
-
-    uint64_t _filted_rows;
     uint64_t _merged_rows;
 
+    OlapReaderStatistics _stats;
     DISALLOW_COPY_AND_ASSIGN(Reader);
 
 };
diff --git a/be/src/olap/row_block.cpp b/be/src/olap/row_block.cpp
index 12a55eb258..6cf9d453d5 100644
--- a/be/src/olap/row_block.cpp
+++ b/be/src/olap/row_block.cpp
@@ -33,78 +33,39 @@ using std::upper_bound;
 using std::vector;
 
 namespace palo {
+
 RowBlock::RowBlock(const vector& tablet_schema) :
-        _is_inited(false),
-        _is_use_vectorized(false),
-        _buf_len(0),
-        _fix_row_len(0),
-        _extend_len(0),
-        _used_buf_size(0),
-        _init_buf_len(0),
-        _grid_items_size(0),
-        _init_row_num(0),
-        _tablet_schema(tablet_schema),
-        _grid_items(NULL),
-        _buf(NULL),
-        _string_buf_head_ptr(NULL),
-        _string_buf_array(NULL),
-        _vectorized_row_batch(NULL) {}
+        _capacity(0),
+        _tablet_schema(tablet_schema) {
+    _tracker.reset(new MemTracker(-1));
+    _mem_pool.reset(new MemPool(_tracker.get()));
+}
 
 RowBlock::~RowBlock() {
-    SAFE_DELETE_ARRAY(_grid_items);
-    SAFE_DELETE_ARRAY(_buf);
-    SAFE_DELETE_ARRAY(_string_buf_array);
-    SAFE_DELETE(_vectorized_row_batch);
+    delete[] _mem_buf;
+    delete[] _storage_buf;
 }
 
 OLAPStatus RowBlock::init(const RowBlockInfo& block_info) {
-    if (_is_inited) {
-        OLAP_LOG_WARNING("fail to init RowBlock; RowBlock has been inited.");
-        // 这里ä¸èƒ½goto,å¦åˆ™å°±ä¼šæ”¾æŽ‰buffer
-        return OLAP_ERR_INIT_FAILED;
-    }
-
-    _grid_items = new (nothrow) GridItem[_tablet_schema.size()];
-    if (_grid_items == NULL) {
-        OLAP_LOG_WARNING("fail to malloc '_grid_items'.");
-        return OLAP_ERR_MALLOC_ERROR;
-    }
-
-    _grid_items_size = _tablet_schema.size();
+    _field_count = _tablet_schema.size();
     _info = block_info;
     _data_file_type = block_info.data_file_type;
     _null_supported = block_info.null_supported;
-
-    // 分é…内存
-    if (_allocate_buffer() != OLAP_SUCCESS) {
-        OLAP_LOG_WARNING("fail to allocate buffer for row_block");
-
-        SAFE_DELETE_ARRAY(_grid_items);
-        return OLAP_ERR_MALLOC_ERROR;
-    }
-
-    _set_field_offsets();
-
-    _is_inited = true;
-
+    _capacity = _info.row_num;
+    _compute_layout();
+    _mem_buf = new char[_mem_buf_bytes];
+    _storage_buf = new char[_storage_buf_bytes];
     return OLAP_SUCCESS;
 }
 
-OLAPStatus RowBlock::compress(char* dest_buffer,
-                          size_t dest_len,
-                          size_t* written_len,
-                          OLAPCompressionType compression_type) const {
-    CHECK_ROWBLOCK_INIT();
-    if (dest_buffer == NULL || written_len == NULL) {
-        OLAP_LOG_WARNING("input NULL pointer. [dest_buffer=%p written_len=%p]",
-                         dest_buffer,
-                         written_len);
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-
-    // è¿™é‡Œæœ‰ä¸ªå˜æ›´ï¼Œå°±æ˜¯åŽ‹ç¼©çš„å¤§å°ä¸åœ¨æŒ‰ç…§å›ºå®šçš„大å°åŽ‹ç¼©ï¼Œè€Œæ˜¯æ ¹æ®_used_buf_size压缩
-    return olap_compress(_buf,
-                         _used_buf_size,
+OLAPStatus RowBlock::serialize_to_row_format(
+        char* dest_buffer, size_t dest_len, size_t* written_len,
+        OLAPCompressionType compression_type) {
+    _convert_memory_to_storage(_info.row_num);
+    _info.checksum = olap_crc32(CRC32_INIT, _storage_buf, _storage_buf_used_bytes);
+    _info.unpacked_len = _storage_buf_used_bytes;
+    return olap_compress(_storage_buf,
+                         _storage_buf_used_bytes,
                          dest_buffer,
                          dest_len,
                          written_len,
@@ -114,152 +75,202 @@ OLAPStatus RowBlock::compress(char* dest_buffer,
 OLAPStatus RowBlock::decompress(const char* src_buffer,
                             size_t src_len,
                             OLAPCompressionType compression_type) {
-    CHECK_ROWBLOCK_INIT();
-    if (src_buffer == NULL) {
-        OLAP_LOG_WARNING("input src_buffer is NULL pointer.");
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-
     size_t written_len = 0;
-    OLAPStatus res = OLAP_SUCCESS;
-    res = olap_decompress(src_buffer,
-                          src_len,
-                          _buf,
-                          _buf_len,
-                          &written_len,
-                          compression_type);
+    OLAPStatus res = olap_decompress(src_buffer,
+                                     src_len,
+                                     _storage_buf,
+                                     _storage_buf_bytes,
+                                     &written_len,
+                                     compression_type);
     if (res != OLAP_SUCCESS) {
         OLAP_LOG_WARNING("fail to do olap_decompress. [res=%d]", res);
         return res;
     }
-
-    uint32_t checksum = olap_crc32(CRC32_INIT, _buf, written_len);
-    if (_info.checksum != checksum) {
-        OLAP_LOG_WARNING("crc32 value does not match. [crc32_value=%u _info.checksum=%u]",
-                         checksum,
-                         _info.checksum);
-        return OLAP_ERR_CHECKSUM_ERROR;
-    }
-
-    return res;
-}
-
-OLAPStatus RowBlock::eval_conjuncts(std::vector conjunct_ctxs) {
-    return eval_conjuncts(conjunct_ctxs, _tablet_schema);
-}
-
-OLAPStatus RowBlock::eval_conjuncts(std::vector conjunct_ctxs,
-                                    const std::vector& query_schema) {
-    OLAPStatus status;
-
-    status = _load_to_vectorized_row_batch(query_schema);
-    if (OLAP_SUCCESS != status) {
-        OLAP_LOG_WARNING("fail to convert to vectorized_row_batch.");
-        return status;
-    }
-
-    for (int i = 0; i < conjunct_ctxs.size(); ++i) {
-        if (!conjunct_ctxs[i]->root()->evaluate(_vectorized_row_batch)) {
-            return OLAP_ERR_EVAL_CONJUNCTS_ERROR;
+    if (_need_checksum) {
+        uint32_t checksum = olap_crc32(CRC32_INIT, _storage_buf, written_len);
+        if (_info.checksum != checksum) {
+            OLAP_LOG_WARNING("crc32 value does not match. [crc32_value=%u _info.checksum=%u]",
+                             checksum,
+                             _info.checksum);
+            return OLAP_ERR_CHECKSUM_ERROR;
         }
     }
-    _is_use_vectorized = true;
-
+    _convert_storage_to_memory();
     return OLAP_SUCCESS;
 }
 
-OLAPStatus RowBlock::set_row(uint32_t row_index, const RowCursor& cursor) {
-    CHECK_ROWBLOCK_INIT();
-    if (row_index >= _info.row_num) {
-        OLAP_LOG_WARNING("input 'row_index' exceeds _info.row_num."
-                         "[row_index=%u; _info.row_num=%u]",
-                         row_index,
-                         _info.row_num);
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
+void RowBlock::_convert_storage_to_memory() {
+    /*
+     * This function is used to convert storage
+     * layout to memory layout for row-oriented storage.
+     * In this procedure, string type(Varchar/Char/Hyperloglog)
+     * should be converted with caution.
+     * This function will not be called in columnar-oriented storage.
+     */
 
-    if (cursor.field_count() != _grid_items_size) {
-        OLAP_LOG_WARNING("input row cursor is not valid for this row block. "
-                         "[input_cursor_field_count=%lu; row_block_field_count=%lu]",
-                         cursor.field_count(),
-                         _grid_items_size);
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
+    char* storage_ptr = _storage_buf;
 
-    OLAPStatus res = OLAP_SUCCESS;
-    size_t field_count = _grid_items_size;
-    for (size_t i = 0; i < field_count; ++i) {
-        char* buf = NULL;
-        if (_tablet_schema[i].type == OLAP_FIELD_TYPE_VARCHAR 
-                || _tablet_schema[i].type == OLAP_FIELD_TYPE_HLL) {
-            buf = _string_buf_array[i].buf_ptr +
-                      //row_num() * _null_byte_num + 
-                      row_index * _string_buf_array[i].string_row_length;
+    // some data file in history not suppored null
+    size_t null_byte = has_nullbyte() ? 1 : 0;
+    for (int col = 0; col < _field_count; ++col) {
+        char* memory_ptr = _mem_buf + _field_offset_in_memory[col];
+        if (_tablet_schema[col].type == OLAP_FIELD_TYPE_VARCHAR ||
+                _tablet_schema[col].type == OLAP_FIELD_TYPE_HLL) {
+            for (int row = 0; row < _info.row_num; ++row) {
+                /*
+                 * Varchar is in offset -> nullbyte|length|content format in storage
+                 * Varchar is in nullbyte|length|ptr in memory
+                 * We need copy three part: nullbyte|length|content
+                 * 1. get values' pointer using offset
+                 * 2. copy null byte
+                 * 3. copy length and content into addrs pointed by ptr
+                 */
+
+                // 1: work out the string pointer by offset
+                uint32_t offset = *reinterpret_cast(storage_ptr);
+                storage_ptr += sizeof(StringOffsetType);
+                char* value_ptr = _storage_buf + offset;
+
+                // 2: copy null byte
+                *reinterpret_cast(memory_ptr) = false;
+                memory_copy(memory_ptr, value_ptr, null_byte);
+
+                // 3. copy length and content
+                size_t storage_field_bytes =
+                    *(StringLengthType*)(value_ptr + null_byte);
+                value_ptr += sizeof(StringLengthType);
+                StringSlice* slice = reinterpret_cast(memory_ptr + 1);
+                slice->data = value_ptr + null_byte;
+                slice->size = storage_field_bytes;
+
+                memory_ptr += _mem_row_bytes;
+            }
+        } else if (_tablet_schema[col].type == OLAP_FIELD_TYPE_CHAR) {
+            size_t storage_field_bytes = _tablet_schema[col].length;
+            for (int row = 0; row < _info.row_num; ++row) {
+                /*
+                 * Char is in nullbyte|content with fixed length in storage
+                 * Char is in nullbyte|length|ptr in memory
+                 * We need copy three part: nullbyte|length|content
+                 * 1. copy null byte
+                 * 2. copy length and content into addrs pointed by ptr
+                 */
+
+                // 1. copy null byte
+                *reinterpret_cast(memory_ptr) = false;
+                memory_copy(memory_ptr, storage_ptr, null_byte);
+
+                // 2. copy length and content
+                StringSlice* slice = reinterpret_cast(memory_ptr + 1);
+                slice->data = storage_ptr + null_byte;
+                slice->size = storage_field_bytes;
+
+                storage_ptr += storage_field_bytes + null_byte;
+                memory_ptr += _mem_row_bytes;
+            }
         } else {
-            buf = _buf + _grid_items[i].offset + row_index * _grid_items[i].width;
-        }
+            size_t storage_field_bytes = _tablet_schema[col].length;
+            for (int row = 0; row < _info.row_num; ++row) {
+                // Content of not string type can be copied using addr
+                *reinterpret_cast(memory_ptr) = false;
+                memory_copy(memory_ptr + 1 - null_byte, storage_ptr, storage_field_bytes + null_byte);
 
-        res = cursor.write_by_index(i, buf);
-        if (res != OLAP_SUCCESS) {
-            OLAP_LOG_WARNING("fail to write field value. [row_index=%u; field_index=%lu res=%d]",
-                             row_index,
-                             i,
-                             res);
-            return res;
+                storage_ptr += storage_field_bytes + null_byte;
+                memory_ptr += _mem_row_bytes;
+            }
         }
     }
+}
 
-    return res;
+void RowBlock::_convert_memory_to_storage(uint32_t num_rows) {
+    // this function is reverse procedure of convert_storage_to_memory
+
+    char* storage_ptr = _storage_buf;
+    // Point to start of storage viriable part
+    char* storage_variable_ptr = _storage_buf + num_rows * _storage_row_fixed_bytes;
+    size_t null_byte = has_nullbyte() ? 1 : 0;
+    for (int col = 0; col < _field_count; ++col) {
+        char* memory_ptr = _mem_buf + _field_offset_in_memory[col];
+        if (_tablet_schema[col].type == OLAP_FIELD_TYPE_VARCHAR ||
+                _tablet_schema[col].type == OLAP_FIELD_TYPE_HLL) {
+            for (int row = 0; row < num_rows; ++row) {
+                /*
+                 * Varchar is in offset -> nullbyte|length|content format in storage
+                 * Varchar is in nullbyte|length|ptr in memory
+                 * We need set three part: offset -> nullbyte|length|content
+                 * 1. set offset
+                 * 2. copy null byte
+                 * 3. copy length and content into sucessive addrs
+                 */
+
+                // 1: set offset
+                size_t offset = storage_variable_ptr - _storage_buf;
+                *reinterpret_cast(storage_ptr) = offset;
+                storage_ptr += sizeof(StringOffsetType);
+
+                // 2: copy null byte
+                memory_copy(storage_variable_ptr, memory_ptr, null_byte);
+                storage_variable_ptr += null_byte;
+
+                // 3. copy length and content
+                StringSlice* slice = reinterpret_cast(memory_ptr + 1);
+                *reinterpret_cast(storage_variable_ptr) = slice->size;
+                storage_variable_ptr += sizeof(StringLengthType);
+                memory_copy(storage_variable_ptr, slice->data, slice->size);
+                storage_variable_ptr += slice->size;
+
+                memory_ptr += _mem_row_bytes;
+            }
+        } else if (_tablet_schema[col].type == OLAP_FIELD_TYPE_CHAR) {
+            size_t storage_field_bytes = _tablet_schema[col].length;
+            for (int row = 0; row < num_rows; ++row) {
+                /*
+                 * Char is in nullbyte|content with fixed length in storage
+                 * Char is in nullbyte|length|ptr in memory
+                 * We need set two part: nullbyte|content
+                 * 1. copy null byte
+                 * 2. copy content
+                 */
+
+                // 1. copy null byte
+                memory_copy(storage_ptr, memory_ptr, null_byte);
+
+                // 2. copy content
+                StringSlice* slice = reinterpret_cast(memory_ptr + 1);
+                memory_copy(storage_ptr + null_byte, slice->data, slice->size);
+                memory_ptr += _mem_row_bytes;
+                storage_ptr += storage_field_bytes + null_byte;
+            }
+        } else {
+            // Memory layout is equal with storage layout, there is nullbyte
+            // for all field. So we need to copy this to storage
+            size_t storage_field_bytes = _tablet_schema[col].length;
+            char* memory_ptr = _mem_buf + _field_offset_in_memory[col];
+            for (int row = 0; row < num_rows; ++row) {
+                memory_copy(storage_ptr, memory_ptr + 1 - null_byte, storage_field_bytes + null_byte);
+                storage_ptr += storage_field_bytes + null_byte;
+                memory_ptr += _mem_row_bytes;
+            }
+        }
+    }
+    _storage_buf_used_bytes = storage_variable_ptr - _storage_buf;
 }
 
 OLAPStatus RowBlock::finalize(uint32_t row_num) {
-    CHECK_ROWBLOCK_INIT();
-    if (row_num > _init_row_num) {
+    if (row_num > _capacity) {
         OLAP_LOG_WARNING("Intput row num is larger than internal row num."
                          "[row_num=%u; _info.row_num=%u]",
                          row_num,
                          _info.row_num);
         return OLAP_ERR_INPUT_PARAMETER_ERROR;
     }
-
-    size_t field_count = _grid_items_size;
-    uint32_t offset = 0;
-    // æž„é€ ç´§è‡´åŽæ¯ä¸ªfield的起点ä½ç½®
-    vector new_offset_array;
-    for (size_t i = 0; i < field_count; ++i) {
-        new_offset_array.push_back(offset);
-        offset += _grid_items[i].width * row_num;
-    }
-
-    //_grid_items[0].offset = row_num * _null_byte_num;
-    for (size_t i = 0; i < field_count; ++i) {
-        char* old_offset = _buf + _grid_items[i].offset;
-        char* new_offset = _buf + new_offset_array[i];
-        uint32_t field_len = _grid_items[i].width;
-
-        memmove(new_offset, old_offset, field_len * row_num);
-        _grid_items[i].offset = new_offset_array[i];
-    }
-
-    // é¢å¤–加入这部分代ç ï¼Œé‡æ–°è°ƒæ•´å—内的åç§»
-    if (_rearrange_string_buffer(row_num, &_used_buf_size)) {
-        OLAP_LOG_WARNING("rearrange varchar buffer failed");
-        return OLAP_ERR_BUFFER_OVERFLOW;
-    }
-
-    // é‡ç½®_info
     _info.row_num = row_num;
-    // 计算checksum
-    _info.checksum = olap_crc32(CRC32_INIT, _buf, _used_buf_size);
-    _info.unpacked_len = _used_buf_size;
-
     return OLAP_SUCCESS;
 }
 
 OLAPStatus RowBlock::find_row(const RowCursor& key,
                               bool find_last,
                               uint32_t* row_index) const {
-    CHECK_ROWBLOCK_INIT();
     if (row_index == NULL) {
         OLAP_LOG_WARNING("input 'row_index' is NULL.");
         return OLAP_ERR_INPUT_PARAMETER_ERROR;
@@ -273,8 +284,7 @@ OLAPStatus RowBlock::find_row(const RowCursor& key,
     }
 
     BinarySearchIterator it_start(0u);
-    BinarySearchIterator it_end(
-            _is_use_vectorized ? _vectorized_row_batch->size() : _info.row_num);
+    BinarySearchIterator it_end(_info.row_num);
     BinarySearchIterator it_result(0u);
 
     RowBlockComparator block_comparator(this, &helper_cursor);
@@ -295,311 +305,58 @@ OLAPStatus RowBlock::find_row(const RowCursor& key,
     return OLAP_SUCCESS;
 }
 
-OLAPStatus RowBlock::backup() {
-    if (NULL == _vectorized_row_batch) {
-        OLAP_LOG_WARNING("fail to backup.");
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-    _vectorized_row_batch->backup();
-
-    return OLAP_SUCCESS;
-}
-
-OLAPStatus RowBlock::restore() {
-    if (NULL == _vectorized_row_batch) {
-        OLAP_LOG_WARNING("fail to backup.");
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-    _vectorized_row_batch->restore();
-
-    return OLAP_SUCCESS;
-}
-
-OLAPStatus RowBlock::clear() {
-    CHECK_ROWBLOCK_INIT();
-
-    _info.row_num = _init_row_num;
+void RowBlock::clear() {
+    _info.row_num = _capacity;
     _info.checksum = 0;
-    _buf_len = _init_buf_len;
-
-    memset(_buf, 0, _buf_len);
-    _set_field_offsets();
-
-    return OLAP_SUCCESS;
+    _mem_pool->clear();
 }
 
-OLAPStatus RowBlock::_load_to_vectorized_row_batch(const std::vector& query_schema) {
-    if (NULL == _vectorized_row_batch) {
-        // TODO(lingbin): is the MemTracker should come from runtime-state?
-        _vectorized_row_batch
-            = new (nothrow) VectorizedRowBatch(_tablet_schema, _info.row_num);
-        if (NULL == _vectorized_row_batch) {
-            OLAP_LOG_WARNING("fail to allocte VectorizedRowBatch.");
-            return OLAP_ERR_MALLOC_ERROR;
-        }
-    }
+void RowBlock::_compute_layout() {
+    size_t memory_size = 0;
+    size_t storage_fixed_bytes = 0;
+    size_t storage_variable_bytes = 0;
+    for (auto& field : _tablet_schema) {
+        _field_offset_in_memory.push_back(memory_size);
 
-    MemPool* mem_pool = _vectorized_row_batch->mem_pool();
-    int size = _vectorized_row_batch->capacity();
-    for (int field_index = 0, query_field_index = 0;
-            field_index < _tablet_schema.size() && query_field_index < query_schema.size();
-            ++field_index) {
-        if (_tablet_schema[field_index].unique_id != query_schema[query_field_index].unique_id
-                || _vectorized_row_batch->column(field_index)->col_data() != NULL) {
-            continue;
-        }
-        ++query_field_index;
-
-        switch (_tablet_schema[field_index].type) {
-        case OLAP_FIELD_TYPE_CHAR: {
-            StringValue* value = reinterpret_cast(
-                                         mem_pool->allocate(
-                                                get_slot_size(TYPE_CHAR) * _info.row_num));
-            char* raw = _buf + _grid_items[field_index].offset;
-            for (int i = 0; i < size; ++i) {
-                value[i].ptr = raw + _grid_items[field_index].width * i;
-                value[i].len = strnlen(value[i].ptr, _tablet_schema[field_index].length);
-            }
-            _vectorized_row_batch->column(field_index)->set_col_data(value);
-            break;
-        }
-        case OLAP_FIELD_TYPE_VARCHAR:
-        case OLAP_FIELD_TYPE_HLL: {
-            typedef uint32_t OffsetValueType;
-            typedef uint16_t LengthValueType;
-            StringValue* value = reinterpret_cast(
-                                         mem_pool->allocate(
-                                                get_slot_size(TYPE_VARCHAR) * _info.row_num));
-            OffsetValueType* offsets = reinterpret_cast(
-                                           _buf + _grid_items[field_index].offset);
-            for (int i = 0; i < size; ++i) {
-                value[i].len
-                    = *reinterpret_cast(_buf + offsets[i]);
-                value[i].ptr = _buf + offsets[i] + sizeof(VarCharField::LengthValueType);
-            }
-
-            _vectorized_row_batch->column(field_index)->set_col_data(value);
-            break;
-        }
-        default: {
-            _vectorized_row_batch->column(field_index)->set_col_data(
-                    _buf + _grid_items[field_index].offset);
-            break;
-        }
-        }
-    }
-    _vectorized_row_batch->set_size(_info.row_num);
-    return OLAP_SUCCESS;
-}
-
-OLAPStatus RowBlock::_allocate_buffer() {
-    OLAPStatus res = OLAP_SUCCESS;
-
-    // 计算fix_row_len
-    _fix_row_len = 0;
-    for (vector::const_iterator iter = _tablet_schema.begin();
-            iter != _tablet_schema.end(); ++iter) {
-        if (iter->type == OLAP_FIELD_TYPE_VARCHAR || iter->type == OLAP_FIELD_TYPE_HLL) {
+        // All field has a nullbyte in memory
+        if (field.type == OLAP_FIELD_TYPE_VARCHAR || field.type == OLAP_FIELD_TYPE_HLL) {
             // å˜é•¿éƒ¨åˆ†é¢å¤–计算下实际最大的字符串长度(此处lengthå·²ç»åŒ…括记录Lengthçš„2个字节)
-            _fix_row_len += sizeof(VarCharField::OffsetValueType);
-            if (OLAP_DATA_FILE == _data_file_type) {
-                if (false == _null_supported) {
-                    _extend_len += iter->length;
-                } else {
-                    _extend_len += iter->length + sizeof(char);
-                }
-            } else if (COLUMN_ORIENTED_FILE == _data_file_type) {
-                _extend_len += iter->length + sizeof(char);
+            storage_fixed_bytes += sizeof(StringOffsetType);
+            storage_variable_bytes += field.length;
+            if (has_nullbyte()) {
+                storage_variable_bytes += sizeof(char);
             }
+            memory_size += sizeof(StringSlice) + sizeof(char);
         } else {
-            // 一般的field无需计算é¢å¤–消耗的空间
-            if (OLAP_DATA_FILE == _data_file_type) {
-                if (false == _null_supported) {
-                    _fix_row_len += iter->length;
-                } else {
-                    _fix_row_len += iter->length + sizeof(char);
-                }
-            } else if (COLUMN_ORIENTED_FILE == _data_file_type) {
-                _fix_row_len += iter->length + sizeof(char);
+            storage_fixed_bytes += field.length;
+            if (has_nullbyte()) {
+                storage_fixed_bytes += sizeof(char);
+            }
+            if (field.type == OLAP_FIELD_TYPE_CHAR) {
+                memory_size += sizeof(StringSlice) + sizeof(char);
+            } else {
+                memory_size += field.length + sizeof(char);
             }
         }
     }
+    _mem_row_bytes = memory_size;
+    _mem_buf_bytes = _mem_row_bytes * _info.row_num;
 
-    //_fix_row_len += _null_byte_num;
-
-    bool auto_allocate = (_info.unpacked_len == 0);
-    if (auto_allocate) {
-        OLAP_LOG_DEBUG("auto_allocate detected");
-        // 釿–°è®¡ç®—一个blockå çš„内存大å°, 定长部分加上一个é¢å¤–çš„ç”¨äºŽé‡æŽ’string的行长
-        _buf_len = (_fix_row_len + _extend_len) * _info.row_num + _extend_len;
-    } else {
-        // å¦‚æžœä¼ å…¥çš„å‚æ•°æœ‰unpack的大å°ï¼Œå°±æ„å‘³ç€æ˜¯è¯»ï¼Œç›´æŽ¥åˆ†é…内存
-        _buf_len = _info.unpacked_len;
-    }
-
-    // 分é…内存
-    if (!_check_memory_limit(_buf_len)) {
-        OLAP_LOG_WARNING("too much memory required.[size=%lu]", _buf_len);
-        res = OLAP_ERR_MALLOC_ERROR;
-        goto ALLOCATE_EXIT;
-    }
-
-    _buf = new (nothrow) char[_buf_len];
-    if (_buf == NULL) {
-        OLAP_LOG_WARNING("fail to alloc memory for _buf. [alloc_size=%lu]", _buf_len);
-        res = OLAP_ERR_MALLOC_ERROR;
-        goto ALLOCATE_EXIT;
-    }
-    memset(_buf, 0, _buf_len);
-
-    if (auto_allocate) {
-        // 这里多出æ¥çš„一行空间是作为交æ¢ç©ºé—´ä½¿ç”¨çš„。
-        // string从定长部分å‘åŽæŽ¨ä¸€è¡Œçš„ä½ç½®å¼€å§‹å­˜ï¼Œè¿™æ ·åœ¨åŽè¾¹æ”¹æˆè‡´å¯†æŽ’列的时候,
-        // å¯ä»¥ç›´æŽ¥ä»ŽåŽå‘剿‹·è´ã€‚
-        _string_buf_head_ptr = _buf + _fix_row_len * (_info.row_num) + _extend_len;
-
-        _string_buf_array = new (nothrow) StringBuffer[_tablet_schema.size()];
-        if (_string_buf_array == NULL) {
-            OLAP_LOG_WARNING("fail to allocate string buffer array");
-            res = OLAP_ERR_MALLOC_ERROR;
-            goto ALLOCATE_EXIT;
-        }
-        // 指å‘字符部分起始ä½ç½®
-        char* buf_helper_ptr = _string_buf_head_ptr;
-        for (size_t i = 0; i < _tablet_schema.size(); i++) {
-            if (_tablet_schema[i].type == OLAP_FIELD_TYPE_VARCHAR || _tablet_schema[i].type == OLAP_FIELD_TYPE_HLL) {
-                // ä¿å­˜è¡Œé•¿å’Œèµ·å§‹çš„æŒ‡é’ˆå³å¯ï¼Œå…¶ä»–å¯é€šè¿‡è®¡ç®—得出
-                _string_buf_array[i].string_row_length = _tablet_schema[i].length;
-                if (OLAP_DATA_FILE == _data_file_type) {
-                    if (false == _null_supported) {
-                        _string_buf_array[i].string_row_length = _tablet_schema[i].length;
-                    } else {
-                        _string_buf_array[i].string_row_length = _tablet_schema[i].length;
-                        _string_buf_array[i].string_row_length += sizeof(char);
-                    }
-                } else if (COLUMN_ORIENTED_FILE == _data_file_type) {
-                    _string_buf_array[i].string_row_length = _tablet_schema[i].length;
-                    _string_buf_array[i].string_row_length += sizeof(char);
-                }
-                _string_buf_array[i].buf_ptr = buf_helper_ptr;
-                // 移动到下一个å˜é•¿fieldçš„ä½ç½®
-                buf_helper_ptr += _string_buf_array[i].string_row_length * _info.row_num;
-            }
-        }
-    }
-    _init_buf_len = _buf_len;
-    _init_row_num = _info.row_num;
-
-ALLOCATE_EXIT:
-    if (res != OLAP_SUCCESS) {
-        SAFE_DELETE_ARRAY(_string_buf_array);
-        SAFE_DELETE_ARRAY(_buf);
-    }
-
-    return res;
-}
-
-OLAPStatus RowBlock::_rearrange_string_buffer(uint32_t row_num, size_t* output_size) {
-    // 考虑到bufferä¸ä¸€å®šè¢«å†™æ»¡ï¼Œæ‰€ä»¥éœ€è¦é‡æ–°è®¡ç®—åç§»
-    char* cur_write_ptr = _buf + _fix_row_len * row_num;
-    // å¦‚æ­¤å®šä¹‰çš„å¥½å¤„æ˜¯ï¼Œä»¥åŽæ”¹å˜äº†ç±»åž‹ï¼Œè¿™è¾¹å¯ä»¥ç›´æŽ¥è·Ÿç€å˜ã€‚
-    VarCharField::LengthValueType* string_length_ptr = NULL;
-    VarCharField::OffsetValueType* cur_offset_ptr = NULL;
-    // 一个block的大å°åº”该ä¸èƒ½è¶…过4G,uint32足矣
-    uint32_t offset = 0;
-    for (size_t col = 0; col < _tablet_schema.size(); ++col) {
-        if (_tablet_schema[col].type == OLAP_FIELD_TYPE_VARCHAR || _tablet_schema[col].type == OLAP_FIELD_TYPE_HLL) {
-            // 指å‘一个新的å˜é•¿å­—符串列
-            char *cur_read_ptr = _string_buf_array[col].buf_ptr;
-            // offset是一列的最大长度
-            offset = _string_buf_array[col].string_row_length;
-            // 这个指针指å‘该å˜é•¿å­—符串列写å移的ä½ç½®ï¼ŒåŽè¾¹éœ€è¦é‡æ–°è®¡ç®—这些åç§»
-            cur_offset_ptr = reinterpret_cast(_buf + _grid_items[col].offset);
-            // é€è¡Œè°ƒæ•´åç§»
-            for (size_t i = 0; i < row_num; i++) {
-                // 获å–长度
-                if (OLAP_DATA_FILE == _data_file_type) {
-                    if (false == _null_supported) {
-                        string_length_ptr = reinterpret_cast(
-                                cur_read_ptr);
-                    } else {
-                        string_length_ptr = reinterpret_cast(
-                                cur_read_ptr + sizeof(char));
-                    }
-                } else if (COLUMN_ORIENTED_FILE == _data_file_type) {
-                    string_length_ptr = reinterpret_cast(
-                            cur_read_ptr + sizeof(char));
-                }
-                // 获å–实际的拷è´é•¿åº¦ï¼Œè¿™ä¸ªé•¿åº¦æ˜¯string长度+ 字符串头表示长度的数字
-                size_t copy_size = 0;
-                if (OLAP_DATA_FILE == _data_file_type) {
-                    if (false == _null_supported) {
-                        copy_size = (*string_length_ptr) +
-                            sizeof(VarCharField::LengthValueType);
-                    } else {
-                        copy_size = (*string_length_ptr) +
-                            sizeof(VarCharField::LengthValueType) + sizeof(char);
-                    }
-                } else if (COLUMN_ORIENTED_FILE == _data_file_type) {
-                    copy_size = (*string_length_ptr) +
-                        sizeof(VarCharField::LengthValueType) + sizeof(char); 
-                }
-                // 其实这里应该是ä¸ä¼šæº¢å‡ºçš„,最多写满
-                if (static_cast(cur_write_ptr - _buf + copy_size) > _buf_len) {
-                    OLAP_LOG_WARNING("not enough buffer, need %lu but has %lu",
-                                     cur_write_ptr - _buf + copy_size,
-                                     _buf_len);
-                    return OLAP_ERR_BUFFER_OVERFLOW;
-                }
-                memcpy(cur_write_ptr, cur_read_ptr, copy_size);
-                // 在offset里ä¿å­˜ä»Ž_bufèµ·çš„åç§»é‡
-                *cur_offset_ptr = cur_write_ptr - _buf;
-                // 移动到下个ä½ç½®
-                cur_read_ptr += offset;
-                cur_write_ptr += copy_size;
-                // 注æ„这里cur_offset_ptr的类型,直接å‘åŽç§»åЍå³å¯ï¼Œä¸èƒ½æŒ‰å­—节数加
-                ++cur_offset_ptr;
-            }
-        }
-    }
-    *output_size = cur_write_ptr - _buf;
-
-    return OLAP_SUCCESS;
-}
-
-void RowBlock::_set_field_offsets() {
-    // åˆå§‹åŒ–field_offset
-    uint32_t offset = 0;
-    for (size_t i = 0; i < _tablet_schema.size(); ++i) {
-        _grid_items[i].offset = offset;
-
-        if (_tablet_schema[i].type == OLAP_FIELD_TYPE_VARCHAR || _tablet_schema[i].type == OLAP_FIELD_TYPE_HLL) {
-            _grid_items[i].width = sizeof(VarCharField::OffsetValueType);
-        } else {
-            if (OLAP_DATA_FILE == _data_file_type) {
-                if (false == _null_supported) {
-                    _grid_items[i].width = _tablet_schema[i].length;
-                } else {
-                    _grid_items[i].width = _tablet_schema[i].length;
-                    _grid_items[i].width += sizeof(char);
-                }
-            } else if (COLUMN_ORIENTED_FILE == _data_file_type) {
-                _grid_items[i].width = _tablet_schema[i].length;
-                _grid_items[i].width += sizeof(char);
-            }
-        }
-
-        offset += _grid_items[i].width * _info.row_num;
+    _storage_row_fixed_bytes = storage_fixed_bytes;
+    _storage_buf_bytes = (storage_fixed_bytes + storage_variable_bytes) * _info.row_num;
+    if (_info.unpacked_len != 0) {
+        // If we already known unpacked length, just use this length
+        _storage_buf_bytes = _info.unpacked_len;
     }
 }
 
-inline bool RowBlock::_check_memory_limit(size_t _buf_len) const {
+inline bool RowBlock::_check_memory_limit(size_t buf_len) const {
     uint64_t max_unpacked_row_block_size = config::max_unpacked_row_block_size;
     max_unpacked_row_block_size =
             max_unpacked_row_block_size == 0 ? OLAP_DEFAULT_MAX_UNPACKED_ROW_BLOCK_SIZE
             : max_unpacked_row_block_size;
 
-    return _buf_len <= max_unpacked_row_block_size;
+    return buf_len <= max_unpacked_row_block_size;
 }
 
 }  // namespace palo
diff --git a/be/src/olap/row_block.h b/be/src/olap/row_block.h
index 69f1fbeb45..adb80af109 100644
--- a/be/src/olap/row_block.h
+++ b/be/src/olap/row_block.h
@@ -27,12 +27,6 @@
 #include "olap/utils.h"
 #include "runtime/vectorized_row_batch.h"
 
-#define CHECK_ROWBLOCK_INIT() \
-    if (!_is_inited) {\
-        OLAP_LOG_WARNING("fail to use uninited RowBlock.");\
-        return OLAP_ERR_NOT_INITED;\
-    }
-
 namespace palo {
 
 class ExprContext;
@@ -51,20 +45,6 @@ struct RowBlockInfo {
     bool null_supported;
 };
 
-// 用于rowblockåšè¡Œåˆ—转æ¢çš„æ ‡å°ºï¼Œç”¨æ¥è¾…助åšè¡Œåˆ—转æ¢ã€‚
-// offset是列存模å¼ä¸‹æŸä¸€åˆ—的存储起始ä½ç½®ï¼Œwidth是æŸä¸€åˆ—的存储长度
-struct GridItem {
-    uint32_t offset;
-    uint32_t width;
-};
-
-struct StringBuffer {
-    StringBuffer(): buf_ptr(NULL) {}
-
-    uint32_t string_row_length;
-    char* buf_ptr;
-};
-
 // 一般由256或512行组æˆä¸€ä¸ªRowBlock。
 // RowBlock类有如下èŒè´£ï¼š
 // 1. 外界从ç£ç›˜ä¸Šè¯»å–未解压数æ®ï¼Œç”¨decompress函数传给RowBlock,解压åŽçš„æ•°æ®ä¿å­˜åœ¨
@@ -76,6 +56,7 @@ class RowBlock {
     // Please keep these classes as 'friend'.  They have to use lots of private fields for
     // faster operation.
     friend class RowBlockChanger;
+    friend class VectorizedRowBatch;
 public:
     RowBlock(const std::vector& tablet_schema);
 
@@ -86,49 +67,23 @@ public:
     // 在field都为定长的情况下根æ®è¿™ä¸¤ä¸ªå€¼å¯ä»¥ç¡®å®šRowBlock内部buffer的大å°ï¼Œ
     // ç›®å‰åªè€ƒè™‘定长,因此在函数å¯ä»¥åˆ†é…内存资æºã€‚
     OLAPStatus init(const RowBlockInfo& block_info);
-    inline void reset_block() {
-        memset(_buf, 0, _buf_len);
-    }
 
-    // 将内部buffer的内容压缩并输出
-    OLAPStatus compress(char* dest_buffer,
-                    size_t dest_len,
-                    size_t* written_len,
-                    OLAPCompressionType compression_type) const;
+    // serialize memory content to row format
+    OLAPStatus serialize_to_row_format(char* dest_buffer,
+                                       size_t dest_len,
+                                       size_t* written_len,
+                                       OLAPCompressionType compression_type);
 
     // 将外部buffer中的压缩数æ®è§£åŽ‹åˆ°æœ¬åœ°çš„buffer, 如果里é¢å·²ç»æœ‰æ•°æ®äº†ï¼Œåˆ™è¦†ç›–。
     OLAPStatus decompress(const char* src_buffer,
                           size_t src_len,
                           OLAPCompressionType compression_type);
 
-    // å‘é‡åŒ–地执行过滤æ¡ä»¶
-    // columnsä¿¡æ¯ç”¨äºŽVectorizedRowBatch的延迟加载
-    OLAPStatus eval_conjuncts(std::vector conjuncts);
+    inline void get_row(uint32_t row_index, RowCursor* cursor) const {
+        cursor->attach(_mem_buf + row_index * _mem_row_bytes);
+    }
 
-    OLAPStatus eval_conjuncts(std::vector conjuncts,
-                              const std::vector& query_schema);
-
-    inline OLAPStatus get_row_to_write(uint32_t row_index, 
-                                      RowCursor* cursor) const;
-
-    // æ ¹æ®è¡Œåç§»é‡ï¼Œè®¾ç½®RowCursorçš„fieldåç§»
-    // _is_use_vectorized为true,则从ç»è¿‡å‘é‡åŒ–æ¡ä»¶è¿‡æ»¤çš„VectorizedRowBatch中读å–
-    // 此时row_index表示VectorizedRowBatch中的行åç§»
-    // å之读å–原始数æ®ï¼Œrow_index表示原始数æ®çš„行åç§»
-    inline OLAPStatus get_row_to_read(uint32_t row_index, 
-                                      RowCursor* cursor) const;
-
-    inline OLAPStatus get_row_to_read(uint32_t row_index, 
-                                      RowCursor* cursor,
-                                      bool force_read_raw_data) const;
-
-    // 按照给定的行åºå·å†™å…¥ä¸€è¡Œæ•°æ®åˆ°å†…部buf中
-    OLAPStatus set_row(uint32_t row_index, const RowCursor& cursor);
-
-    // ç»“æŸæœ¬æ‰¹æ¬¡rowblock的写入行为,如果传入的row_num与内部åˆå§‹åŒ–传入的row_numä¸åŒï¼Œ
-    // 则自己åšç´§è‡´åŒ–,并修改内部的row_num
-    // finalize之åŽä¸èƒ½å†ç”¨set_row写入,调用clear坿¢å¤åˆå§‹çжæ€
-    // finalize会计算checksum的值,填入_info.checksum
+    // called when finished fill this row_block
     OLAPStatus finalize(uint32_t row_num);
 
     // æ ¹æ®key的值在RowBlock内部åšäºŒåˆ†æŸ¥æ‰¾ï¼Œè¿”回第一æ¡å¯¹åº”çš„row_index,
@@ -138,44 +93,35 @@ public:
                         bool find_last, 
                         uint32_t* row_index) const;
 
-    OLAPStatus backup();
+    const uint32_t row_num() const { return _info.row_num; }
+    const RowBlockInfo& row_block_info() const { return _info; }
+    const std::vector& tablet_schema() const { return _tablet_schema; }
+    size_t buf_len() const { return _storage_buf_bytes; }
 
-    OLAPStatus restore();
+    size_t capacity() const { return _capacity; }
 
-    const uint32_t row_num() const {
-        return _is_use_vectorized ? _vectorized_row_batch->size() : _info.row_num;
+    // Return field pointer, this pointer point to the nullbyte before the field
+    // layout is nullbyte|Field
+    inline char* field_ptr(size_t row, size_t col) const {
+        return _mem_buf + _mem_row_bytes * row + _field_offset_in_memory[col];
     }
 
-    const RowBlockInfo& row_block_info() const {
-        return _info;
-    }
-
-    const std::vector& tablet_schema() const {
-        return _tablet_schema;
-    }
-
-    size_t buf_len() const {
-        return _buf_len;
-    }
-
-    char* buf() const {
-        return _buf;
-    }
-
-    // 这个å˜é‡æ˜¯ç”¨æ¥è®°å½•这个m_buf内实际使用的字节数
-    size_t used_buf_len() const {
-        return _used_buf_size;
-    }
-
-    size_t allocated_row_num() const {
-        return _init_row_num;
+    MemPool* mem_pool() const {
+        return _mem_pool.get();
     }
 
     // é‡ç”¨rowblock之å‰éœ€è°ƒç”¨clear,æ¢å¤åˆ°init之åŽçš„原始状æ€
-    OLAPStatus clear();
+    void clear();
 
-    // 分é…内存部分å•独拿出æ¥
-    OLAPStatus _allocate_buffer();
+    size_t pos() const { return _pos; }
+    void set_pos(size_t pos) { _pos = pos; }
+    void pos_inc() { _pos++; }
+    size_t limit() const { return _limit; }
+    void set_limit(size_t limit) { _limit = limit; }
+    size_t remaining() const { return _limit - _pos; }
+    bool has_remaining() const { return _pos < _limit; }
+    uint8_t block_status() const { return _block_status; }
+    void set_block_status(uint8_t status) { _block_status = status; }
 
 private:
     // 仿函数里,根æ®iteratorçš„operator*返回的åºå·èŽ·å–æ•°æ®ç»“构的值,
@@ -186,8 +132,6 @@ private:
                            RowCursor* helper_cursor) :
                 _container(container),
                 _helper_cursor(helper_cursor) {}
-
-        // å› ä¸ºæ˜¯ä»¿å‡½æ•°ï¼Œæ‰€ä»¥æžæž„函数ä¸éœ€è¦delete指针æˆå‘˜
         ~RowBlockComparator() {}
         
         // less comparator
@@ -203,14 +147,7 @@ private:
         bool _compare(const iterator_offset_t& index,
                       const RowCursor& key,
                       ComparatorEnum comparator_enum) const {
-            OLAPStatus res = OLAP_SUCCESS;
-
-            res = _container->get_row_to_read(index, _helper_cursor);
-            if (res != OLAP_SUCCESS) {
-                OLAP_LOG_WARNING("fail to get row to read. [res=%d]", res);
-                throw ComparatorException();
-            }
-
+            _container->get_row(index, _helper_cursor);
             if (comparator_enum == COMPARATOR_LESS) {
                 return _helper_cursor->cmp(key) < 0;
             } else {
@@ -222,204 +159,68 @@ private:
         RowCursor* _helper_cursor;
     };
 
-    OLAPStatus _load_to_vectorized_row_batch(const std::vector& query_schema);
+    bool has_nullbyte() {
+        return _data_file_type == COLUMN_ORIENTED_FILE || _null_supported;
+    }
 
-    // rearrange string buffer
-    OLAPStatus _rearrange_string_buffer(uint32_t row_num, size_t* output_size);
+    // Compute layout for storage buffer and  memory buffer
+    void _compute_layout();
 
-    // è®¾ç½®å†…éƒ¨è¡Œåˆ—è½¬æ¢æ ‡å°º
-    void _set_field_offsets();
+    bool _check_memory_limit(size_t buf_len) const;
 
-    bool _check_memory_limit(size_t _buf_len) const;
+    // Fill memory buffer from decompressed buffer, used in decompress function
+    void _convert_storage_to_memory();
 
-    bool _is_inited;           // æ˜¯å¦æ­£å¸¸å®Œæˆåˆå§‹åŒ–
-    bool _is_use_vectorized;   // æ˜¯å¦æ‰§è¡Œå‘é‡åŒ–æ¡ä»¶è¿‡æ»¤
-    size_t _buf_len;           // buffer长度
-    size_t _fix_row_len;       // buffer中,定长部分的长度
-    size_t _extend_len;
-    size_t _used_buf_size;     // 这个å˜é‡æ˜¯ç”¨æ¥è®°å½•这个m_buf内实际使用的字节数
-    size_t _init_buf_len;      // åšå®Œinit之åŽçš„buf_len
-    size_t _grid_items_size;
-    uint32_t _init_row_num;    // åšå®Œinit之åŽçš„row_num
-    RowBlockInfo _info;        // 头信æ¯
+    // Fill storage buffer from memory buffer, prepare formated data to save in storage
+    void _convert_memory_to_storage(uint32_t row_num);
+
+    uint32_t _capacity;
+    RowBlockInfo _info;
     const std::vector& _tablet_schema;     // 内部ä¿å­˜çš„schema奿Ÿ„
 
-    GridItem* _grid_items;     // 替æ¢_field_offset实现
-    char* _buf;                // ä¿å­˜è§£åŽ‹åŽæ•°æ®çš„buffer
-    char* _string_buf_head_ptr;
-    StringBuffer* _string_buf_array;
-    //size_t _null_field_num;
-    size_t _null_byte_num;
     bool _null_supported;
     DataFileType _data_file_type;
 
-    VectorizedRowBatch* _vectorized_row_batch;
+    size_t _field_count = 0;
+    bool _need_checksum = true;
 
+    // Data in memory is construct from row cursors, these row cursors's size is equal
+    char* _mem_buf = nullptr;
+    // equal with _mem_row_bytes * _info.row_num
+    size_t _mem_buf_bytes = 0;
+    // row's size in bytes, in one block, all rows's size is equal
+    size_t _mem_row_bytes = 0;
+
+    // Field offset of memory row format, used to get field ptr in memory row
+    std::vector _field_offset_in_memory;
+
+    // Data in storage will be construct of two parts: fixed-length field stored in ahead
+    // of buffer; content of variable length field(Varchar/HLL) are stored after first part
+
+    // used to save data which will read from/wirte to storage.
+    // when compress, data in _mem_buf will be converted to format in storage here
+    // when decompress, compressed data is decompressed here, then covert to _mem_buf
+    char* _storage_buf = nullptr;
+    // Size of _storage_buf
+    size_t _storage_buf_bytes = 0;
+    // size of storage row's fixed part
+    size_t _storage_row_fixed_bytes = 0;
+    // Used size. when convert memory buffer to storage buffer, because not all values of
+    // varchar field are max length
+    size_t _storage_buf_used_bytes = 0;
+
+    // only used for SegmentReader to covert VectorizedRowBatch to RowBlock
+    // Be careful to use this
+    size_t _pos = 0;
+    size_t _limit = 0;
+    uint8_t _block_status = DEL_PARTIAL_SATISFIED;
+
+    std::unique_ptr _tracker;
+    std::unique_ptr _mem_pool;
     // ç”±äºŽå†…éƒ¨æŒæœ‰å†…存资æºï¼Œæ‰€ä»¥è¿™é‡Œç¦æ­¢æ‹·è´å’Œèµ‹å€¼
     DISALLOW_COPY_AND_ASSIGN(RowBlock);
 };
 
-// 写在头文件中,便于编译器inline
-inline OLAPStatus RowBlock::get_row_to_write(uint32_t row_index, RowCursor* cursor) const {
-    CHECK_ROWBLOCK_INIT();
-
-    if (row_index >= _info.row_num) {
-        OLAP_LOG_WARNING("input row index exceeds row_num in row block info."
-                         "[row_index=%u; _info.row_num=%u]",
-                         row_index,
-                         _info.row_num);
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-
-    if (cursor == NULL) {
-        OLAP_LOG_WARNING("input row cursor is NULL pointer.");
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-
-    if (cursor->field_count() != _grid_items_size) {
-        OLAP_LOG_WARNING("input row cursor is invalid for this row block. "
-                         "[input_cursor_field_count=%lu; row_block_field_count=%lu]",
-                         cursor->field_count(),
-                         _grid_items_size);
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-
-    OLAPStatus res = OLAP_SUCCESS;
-
-    for (uint32_t i = 0; i < _grid_items_size; ++i) {
-        // 如果是varchar类型,è¦å…ˆåšä¸€ä¸‹å移,让string被写入到bufåŽè¾¹çš„éžå®šé•¿éƒ¨åˆ†
-        if (_tablet_schema[i].type == OLAP_FIELD_TYPE_VARCHAR 
-                || _tablet_schema[i].type == OLAP_FIELD_TYPE_HLL) {
-            if (OLAP_DATA_FILE == _data_file_type) {
-                if (false == _null_supported) {
-                    res = cursor->attach_by_index(i, _string_buf_array[i].buf_ptr +
-                                  row_index * _string_buf_array[i].string_row_length, false);
-                } else {
-                    res = cursor->attach_by_index(i, _string_buf_array[i].buf_ptr +
-                                  row_index * _string_buf_array[i].string_row_length, true);
-                }
-            } else if (COLUMN_ORIENTED_FILE == _data_file_type) {
-                res = cursor->attach_by_index(i, _string_buf_array[i].buf_ptr +
-                              row_index * _string_buf_array[i].string_row_length, true);
-            }
-        } else {
-            if (OLAP_DATA_FILE == _data_file_type) {
-                if (false == _null_supported) {
-                    res = cursor->attach_by_index(i, _buf + _grid_items[i].offset +
-                                  row_index * _grid_items[i].width, false);
-                } else {
-                    res = cursor->attach_by_index(i, _buf + _grid_items[i].offset +
-                                  row_index * _grid_items[i].width, true);
-                }
-            } else if (COLUMN_ORIENTED_FILE == _data_file_type) {
-                    res = cursor->attach_by_index(i, _buf + _grid_items[i].offset +
-                                  row_index * _grid_items[i].width, true);
-            }
-        }
-
-        if (res != OLAP_SUCCESS) {
-            OLAP_LOG_WARNING("fail to set cursor offsets. "
-                             "[res=%d row_index=%u field_index=%u]",
-                             res, row_index, i);
-            return res;
-        }
-    }
-
-    return res;
-}
-
-inline OLAPStatus RowBlock::get_row_to_read(uint32_t row_index, 
-                                            RowCursor* cursor,
-                                            bool force_read_raw_data) const {
-    CHECK_ROWBLOCK_INIT();
-
-    if (cursor == NULL) {
-        OLAP_LOG_WARNING("input row cursor is NULL pointer.");
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-
-    if (cursor->field_count() != _grid_items_size) {
-        OLAP_LOG_WARNING("input row cursor is invalid for this row block. "
-                         "[input_cursor_field_count=%lu row_block_field_count=%lu]",
-                         cursor->field_count(),
-                         _grid_items_size);
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-
-    if (!force_read_raw_data && _is_use_vectorized) {
-        if (NULL == _vectorized_row_batch) {
-            OLAP_LOG_WARNING("fail to get_row_to_read since _vectorized_row_batch is NULL");
-            return OLAP_ERR_INPUT_PARAMETER_ERROR;
-        }
-        if (row_index >= _vectorized_row_batch->size()) {
-            OLAP_LOG_WARNING("input row_index exceeds _vectorized_row_batch.size."
-                             "[row_index=%u; _vectorized_row_batch.size=%u]",
-                             row_index,
-                             _vectorized_row_batch->size());
-            return OLAP_ERR_INPUT_PARAMETER_ERROR;
-        }
-        if (_vectorized_row_batch->selected_in_use()) {
-            row_index = _vectorized_row_batch->selected()[row_index];
-        }
-    } else {
-        if (row_index >= _info.row_num) {
-            OLAP_LOG_WARNING("input row_index exceeds _info.row_num."
-                             "[row_index=%u; _info.row_num=%u]",
-                             row_index,
-                             _info.row_num);
-            return OLAP_ERR_INPUT_PARAMETER_ERROR;
-        }
-    }
-
-    OLAPStatus res = OLAP_SUCCESS;
-    VarCharField::OffsetValueType* offset;
-
-    //cursor->attach_null_array(_buf + row_index * _null_byte_num);
-    for (uint32_t i = 0; i < _grid_items_size; ++i) {
-        if (_tablet_schema[i].type == OLAP_FIELD_TYPE_VARCHAR 
-                || _tablet_schema[i].type == OLAP_FIELD_TYPE_HLL) {
-            offset = reinterpret_cast(
-                             _buf + _grid_items[i].offset + row_index * _grid_items[i].width);
-            if (OLAP_DATA_FILE == _data_file_type) {
-                if (false == _null_supported) {
-                    res = cursor->attach_by_index(i, _buf + (*offset), false);
-                } else {
-                    res = cursor->attach_by_index(i, _buf + (*offset), true);
-                }
-            } else if (COLUMN_ORIENTED_FILE == _data_file_type) {
-                    res = cursor->attach_by_index(i, _buf + (*offset), true);
-            }
-        } else {
-            if (OLAP_DATA_FILE == _data_file_type) {
-                if (false == _null_supported) {
-                    res = cursor->attach_by_index(i, _buf +
-                                  _grid_items[i].offset + row_index * _grid_items[i].width, false);
-                } else {
-                    res = cursor->attach_by_index(i, _buf +
-                                  _grid_items[i].offset + row_index * _grid_items[i].width, true);
-                }
-            } else if (COLUMN_ORIENTED_FILE == _data_file_type) {
-                    res = cursor->attach_by_index(i, _buf +
-                                  _grid_items[i].offset + row_index * _grid_items[i].width, true);
-            }
-        }
-
-        if (res != OLAP_SUCCESS) {
-            OLAP_LOG_WARNING("fail to set cursor offsets."
-                             "[res=%d row_index=%u field_index=%u]",
-                             res, row_index, i);
-            return res;
-        }
-    }
-    return res;
-}
-
-// æ­¤å‡½æ•°ä¿æŒå’Œä»¥å‰ä¸€æ ·
-inline OLAPStatus RowBlock::get_row_to_read(uint32_t row_index, 
-                                            RowCursor* cursor) const {
-    return get_row_to_read(row_index, cursor, false);
-}
-
 }  // namespace palo
 
 #endif // BDG_PALO_BE_SRC_OLAP_ROW_BLOCK_H
diff --git a/be/src/olap/row_cursor.cpp b/be/src/olap/row_cursor.cpp
index df7b10a801..a7c35715ce 100644
--- a/be/src/olap/row_cursor.cpp
+++ b/be/src/olap/row_cursor.cpp
@@ -24,33 +24,90 @@ using std::vector;
 
 namespace palo {
 RowCursor::RowCursor() :
-        _field_array(NULL),
-        _field_array_size(0),
-        _columns(NULL),
-        _columns_size(0),
         _key_column_num(0),
-        _length(0),
-        _length_mysql(0),
-        _field_length_array(NULL),
-        _field_offset(NULL),
-        _is_inited(false),
-        _buf(NULL),
-        _is_mysql_compatible(true) {}
+        _fixed_len(0),
+        _variable_len(0),
+        _variable_buf_allocated_by_pool(false) {}
 
 RowCursor::~RowCursor() {
     // delete RowCursor's Fields
-    for (size_t i = 0; i < _field_array_size; ++i) {
-        SAFE_DELETE(_field_array[i]);
+    for (auto field : _field_array) {
+        delete field;
     }
 
-    _is_inited = false;
     _key_column_num = 0;
-    
-    SAFE_DELETE_ARRAY(_field_array);
-    SAFE_DELETE_ARRAY(_field_length_array);
-    SAFE_DELETE_ARRAY(_field_offset);
-    SAFE_DELETE_ARRAY(_columns);
-    SAFE_DELETE_ARRAY(_buf);
+
+    delete [] _owned_fixed_buf;
+    if (!_variable_buf_allocated_by_pool) {
+        for (HllContext* context : hll_contexts) {
+            delete context;
+        }
+
+        delete [] _variable_buf;
+    }
+}
+
+OLAPStatus RowCursor::_init(const std::vector& tablet_schema,
+                            const std::vector& columns) {
+    _field_array.resize(tablet_schema.size(), nullptr);
+    _columns = columns;
+
+    std::vector field_buf_lens;
+    for (size_t i = 0; i < tablet_schema.size(); ++i) {
+        FieldType type = tablet_schema[i].type;
+        if (type == OLAP_FIELD_TYPE_CHAR ||
+            type == OLAP_FIELD_TYPE_VARCHAR ||
+            type == OLAP_FIELD_TYPE_HLL) {
+            field_buf_lens.push_back(sizeof(StringSlice));
+        } else {
+            field_buf_lens.push_back(tablet_schema[i].length);
+        }
+    }
+
+    _key_column_num = tablet_schema.size();
+    for (size_t i = tablet_schema.size() - 1; i >= 0; --i) {
+        if (tablet_schema[i].is_key) {
+            _key_column_num = i + 1;
+            break;
+        }
+    }
+
+    _fixed_len = 0;
+    _variable_len = 0;
+    for (auto cid : _columns) {
+        _field_array[cid] = Field::create(tablet_schema[cid]);
+        if (_field_array[cid] == NULL) {
+            OLAP_LOG_WARNING("Fail to create field.");
+            return OLAP_ERR_INIT_FAILED;
+        }
+        _fixed_len += field_buf_lens[cid] + 1; //1 for null byte
+        FieldType type = tablet_schema[cid].type;
+        if (type == OLAP_FIELD_TYPE_VARCHAR) {
+            _variable_len += tablet_schema[cid].length - OLAP_STRING_MAX_BYTES;
+        } else if (type == OLAP_FIELD_TYPE_CHAR) {
+            _variable_len += tablet_schema[cid].length;
+        } else if (type == OLAP_FIELD_TYPE_HLL) {
+            _variable_len += HLL_COLUMN_DEFAULT_LEN + sizeof(HllContext*);
+        }
+    }
+
+    _fixed_buf = new (nothrow) char[_fixed_len];
+    if (_fixed_buf == nullptr) {
+        OLAP_LOG_WARNING("Fail to malloc _fixed_buf.");
+        return OLAP_ERR_MALLOC_ERROR;
+    }
+    _owned_fixed_buf = _fixed_buf;
+    memset(_fixed_buf, 0, _fixed_len);
+
+    _field_offsets.resize(tablet_schema.size(), -1);
+    size_t offset = 0;
+    for (auto cid : _columns) {
+        _field_offsets[cid] = offset;
+        _field_array[cid]->set_offset(offset);
+        offset += field_buf_lens[cid] + 1;
+    }
+
+    return OLAP_SUCCESS;
 }
 
 OLAPStatus RowCursor::init(const vector& tablet_schema) {
@@ -58,186 +115,155 @@ OLAPStatus RowCursor::init(const vector& tablet_schema) {
 }
 
 OLAPStatus RowCursor::init(const vector& tablet_schema, size_t column_count) {
-    if (_is_inited) {
-        OLAP_LOG_WARNING("Fail to init RowCursor; RowCursor has been inited.");
-        
-        return OLAP_ERR_INIT_FAILED;
-    } else if (column_count > tablet_schema.size()) {
+    if (column_count > tablet_schema.size()) {
         OLAP_LOG_WARNING("input param are invalid. Column count is bigger than table schema size."
                          "[column_count=%lu tablet_schema.size=%lu]",
                          column_count,
                          tablet_schema.size());
-        
         return OLAP_ERR_INPUT_PARAMETER_ERROR;
     }
 
-    // create an array of fields with size = column_count
-    vector columns(column_count);
-    for (uint32_t i = 0; i < column_count; ++i) {
-        columns[i] = i;
+    std::vector columns;
+    for (size_t i = 0; i < column_count; ++i) {
+        columns.push_back(i);
     }
-    
-    return _init(tablet_schema, columns, nullptr);
-}
-
-OLAPStatus RowCursor::init_keys(const std::vector& tablet_schema, 
-                         const std::vector& keys) {
-    std::vector lengths;
-    for (int i = 0; i < keys.size(); i++) {
-        lengths.push_back(keys[i].length());
-    }
-    return init_keys(tablet_schema, lengths);
-}
-
-OLAPStatus RowCursor::init_keys(const std::vector& tablet_schema, 
-                    const std::vector& lengths) {
-    if (_is_inited) {
-        OLAP_LOG_WARNING("Fail to init RowCursor; RowCursor has been inited.");
-
-        return OLAP_ERR_INIT_FAILED;
-    } else if (lengths.size() > tablet_schema.size()) {
-        OLAP_LOG_WARNING("input param are invalid. Column count is bigger than table schema size."
-                         "[column_count=%lu tablet_schema.size=%lu]",
-                         lengths.size(),
-                         tablet_schema.size());
-
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-
-    vector columns(lengths.size());
-    for (uint32_t i = 0; i < lengths.size(); ++i) {
-        columns[i] = i;
-    }
-
-    return _init(tablet_schema, columns, &lengths);
+    RETURN_NOT_OK(_init(tablet_schema, columns));
+    return OLAP_SUCCESS;
 }
 
 OLAPStatus RowCursor::init(
         const vector& tablet_schema,
         const vector& columns) {
-    return _init(tablet_schema, columns, nullptr);
+    RETURN_NOT_OK(_init(tablet_schema, columns));
+    return OLAP_SUCCESS;
 }
-    
-OLAPStatus RowCursor::_init(
-        const vector& tablet_schema,
-        const vector& columns, const std::vector* lengths) {
-    // OLAP里é¢å¼ºåˆ¶éœ€è¦schema里é¢key在å‰ï¼Œvalue在åŽ
-    _field_array_size = tablet_schema.size();
-    _field_array = new (nothrow) Field*[tablet_schema.size()];
-    _columns = new (nothrow) uint32_t[columns.size()];
-    _field_length_array = new (nothrow) size_t[_field_array_size];
-    _field_offset = new (nothrow) size_t[_field_array_size];
-    if (_field_array == NULL
-            || _columns == NULL
-            || _field_offset == NULL
-            || _field_length_array == NULL) {
-        OLAP_LOG_WARNING("Fail to malloc internal structures."
-                         "[tablet_schema_size=%lu; columns_size=%lu]",
-                         tablet_schema.size(),
-                         columns.size());
-        
-        return OLAP_ERR_MALLOC_ERROR;
+
+OLAPStatus RowCursor::init_scan_key(const std::vector& tablet_schema,
+                                    const std::vector& scan_keys) {
+    size_t scan_key_size = scan_keys.size();
+    if (scan_key_size > tablet_schema.size()) {
+        OLAP_LOG_WARNING("input param are invalid. Column count is bigger than table schema size."
+                         "[column_count=%lu tablet_schema.size=%lu]",
+                         scan_key_size ,
+                         tablet_schema.size());
+        return OLAP_ERR_INPUT_PARAMETER_ERROR;
     }
 
-    // 因为区间key有å¯èƒ½å¤§äºŽschemaçš„length,所以å–大
-    vector field_buf_lens;
-    for (size_t i = 0; i < _field_array_size; ++i) {
-        if (lengths != nullptr && i < lengths->size()) {
-            size_t buf_len = 0;
-            switch (tablet_schema[i].type) {
-                case OLAP_FIELD_TYPE_VARCHAR:
-                    buf_len = (*lengths)[i] + sizeof(VarCharField::LengthValueType);
-                    break;
-                case OLAP_FIELD_TYPE_CHAR:
-                    buf_len = (*lengths)[i];
-                    break;
-                default:
-                    ;
+    std::vector columns;
+    for (size_t i = 0; i < scan_key_size; ++i) {
+        columns.push_back(i);
+    }
+
+    RETURN_NOT_OK(_init(tablet_schema, columns));
+
+    // NOTE: cid equal with column index
+    // Hyperloglog cannot be key, no need to handle it
+    _variable_len = 0;
+    for (auto cid : _columns) {
+        FieldType type = tablet_schema[cid].type;
+        if (type == OLAP_FIELD_TYPE_VARCHAR) {
+            _variable_len += scan_keys[cid].length();
+        } else if (type == OLAP_FIELD_TYPE_CHAR) {
+            _variable_len += std::max(
+                scan_keys[cid].length(), (size_t)(tablet_schema[cid].length));
+        }
+    }
+
+    // variable_len for null bytes
+    _variable_buf = new (nothrow) char[_variable_len];
+    if (_variable_buf == NULL) {
+        OLAP_LOG_WARNING("Fail to malloc _variable_buf.");
+        return OLAP_ERR_MALLOC_ERROR;
+    }
+    memset(_variable_buf, 0, _variable_len);
+    char* fixed_ptr = _fixed_buf;
+    char* variable_ptr = _variable_buf;
+    for (auto cid : _columns) {
+        fixed_ptr = _fixed_buf + _field_array[cid]->get_offset();
+        FieldType type = tablet_schema[cid].type;
+        if (type == OLAP_FIELD_TYPE_VARCHAR) {
+            StringSlice* slice = reinterpret_cast(fixed_ptr + 1);
+            slice->data = variable_ptr;
+            slice->size = scan_keys[cid].length();
+            variable_ptr += scan_keys[cid].length();
+        } else if (type == OLAP_FIELD_TYPE_CHAR) {
+            StringSlice* slice = reinterpret_cast(fixed_ptr + 1);
+            slice->data = variable_ptr;
+            slice->size = std::max(scan_keys[cid].length(), (size_t)(tablet_schema[cid].length));
+            variable_ptr += slice->size;
+        }
+    }
+
+    return OLAP_SUCCESS;
+}
+
+OLAPStatus RowCursor::allocate_memory_for_string_type(
+        const std::vector& tablet_schema,
+        MemPool* mem_pool) {
+    // allocate memory for string type(char, varchar, hll)
+    // The memory allocated in this function is used in aggregate and copy function
+    if (_variable_len == 0) { return OLAP_SUCCESS; }
+    if (mem_pool != nullptr) {
+        /*
+         * It is called by row_cursor of row_block in sc(schema change)
+         * or be/ce if mem_pool is not null. RowCursor in rowblock do not
+         * allocate memory for string type in initialization. So memory
+         * for string and hllcontext are all administrated by mem_pool.
+         */
+        _variable_buf = reinterpret_cast(mem_pool->allocate(_variable_len));
+        _variable_buf_allocated_by_pool = true;
+    } else {
+        DCHECK(_variable_buf == nullptr) << "allocate memory twice";
+        _variable_buf = new (nothrow) char[_variable_len];
+    }
+    memset(_variable_buf, 0, _variable_len);
+
+    // init slice of char, varchar, hll type
+    char* fixed_ptr = _fixed_buf;
+    char* variable_ptr = _variable_buf;
+    for (auto cid : _columns) {
+        fixed_ptr = _fixed_buf + _field_array[cid]->get_offset();
+        FieldType type = tablet_schema[cid].type;
+        if (type == OLAP_FIELD_TYPE_VARCHAR) {
+            StringSlice* slice = reinterpret_cast(fixed_ptr + 1);
+            slice->data = variable_ptr;
+            slice->size = tablet_schema[cid].length - OLAP_STRING_MAX_BYTES;
+            variable_ptr += slice->size;
+        } else if (type == OLAP_FIELD_TYPE_CHAR) {
+            StringSlice* slice = reinterpret_cast(fixed_ptr + 1);
+            slice->data = variable_ptr;
+            slice->size = tablet_schema[cid].length;
+            variable_ptr += slice->size;
+        } else if (type == OLAP_FIELD_TYPE_HLL) {
+            StringSlice* slice = reinterpret_cast(fixed_ptr + 1);
+            HllContext* context = nullptr;
+            if (mem_pool != nullptr) {
+                context = reinterpret_cast(mem_pool->allocate(sizeof(HllContext)));
+            } else {
+                // store context addr, which will be freed
+                // in deconstructor if allocated by new function
+                context = new HllContext();
+                hll_contexts.push_back(context);
             }
-            field_buf_lens.push_back(std::max(buf_len, (size_t)tablet_schema[i].length));
-        } else {
-            field_buf_lens.push_back(tablet_schema[i].length);
+
+            *(size_t*)(variable_ptr) = (size_t)(context);
+            variable_ptr += sizeof(HllContext*);
+            slice->data = variable_ptr;
+            slice->size = HLL_COLUMN_DEFAULT_LEN;
+            variable_ptr += slice->size;
         }
     }
-    
-    for (size_t i = 0; i < _field_array_size; ++i) {
-        _field_array[i] = NULL;
-        _field_length_array[i] = field_buf_lens[i] + sizeof(char);
-        _field_offset[i] = field_buf_lens[i] + sizeof(char);
-    }
-
-    size_t len = 0;
-    _columns_size = columns.size();
-    for (size_t i = 0; i < _columns_size; ++i) {
-        _field_array[columns[i]] = Field::create(tablet_schema[columns[i]]);
-        if (_field_array[columns[i]] == NULL) {
-            OLAP_LOG_WARNING("Fail to create field.");
-            return OLAP_ERR_INIT_FAILED;
-        }
-        // 判断是å¦éœ€è¦è¿›è¡Œåˆ°MySQL的类型转æ¢
-        if (tablet_schema[columns[i]].type == OLAP_FIELD_TYPE_DISCRETE_DOUBLE) {
-            _is_mysql_compatible = false;
-        }
-
-        if (tablet_schema[columns[i]].type == OLAP_FIELD_TYPE_CHAR
-                  || tablet_schema[columns[i]].type == OLAP_FIELD_TYPE_VARCHAR) {
-            _field_array[columns[i]]->set_buf_size(field_buf_lens[columns[i]]);
-            _field_array[columns[i]]->set_string_length(field_buf_lens[columns[i]]);
-        }   
-        _columns[i] = columns[i];
-        // 计算行长度
-        len += field_buf_lens[columns[i]];
-    }
-
-    // 计算schema当中keyçš„ä¸ªæ•°ï¼Œé¡ºä¾¿æ£€æŸ¥æ˜¯å¦æœ‰value在key之å‰çš„错误
-    _key_column_num = _field_array_size;
-    bool is_last_column_key = true;
-    bool is_current_column_key = false;
-    for (size_t i = 0; i < _field_array_size; ++i) {
-        is_current_column_key = tablet_schema[i].is_key;
-        if (is_last_column_key && !is_current_column_key) {
-            _key_column_num = i;
-        }
-
-        // å‘现有value在keyå‰é¢çš„æƒ…况,则报错
-        if (!is_last_column_key && is_current_column_key) {
-            OLAP_LOG_WARNING("invalid schema format; value column is before key column."
-                             "[column_index=%lu]",
-                             i);
-            return OLAP_ERR_INVALID_SCHEMA;
-        }
-
-        is_last_column_key = is_current_column_key;
-    }
-
-    _length = len + _columns_size;
-    _length_mysql = len;
-    _buf = new (nothrow) char[_length];
-    if (_buf == NULL) {
-        OLAP_LOG_WARNING("Fail to malloc _buf.");
-        return OLAP_ERR_MALLOC_ERROR;
-    }
-    memset(_buf, 0, _length);
-
-    size_t offset = 0;
-    for (size_t i = 0; i < _columns_size; ++i) {
-        _field_array[_columns[i]]->attach_field(_buf + offset);
-        offset += _field_offset[_columns[i]];
-    }
-    _is_inited = true;
     return OLAP_SUCCESS;
 }
 
 int RowCursor::full_key_cmp(const RowCursor& other) const {
-    if (!_is_inited) {
-        OLAP_LOG_FATAL("row curosr is not inited.");
-        return -1;
-    }
-
     // åªæœ‰key columnæ‰ä¼šå‚与比较
     int res = 0;
     for (size_t i = 0; i < _key_column_num; ++i) {
-        if (0 != (res = _field_array[i]->cmp(other._field_array[i]))) {
+        char* left = _field_array[i]->get_field_ptr(_fixed_buf);
+        char* right = other._field_array[i]->get_field_ptr(other.get_buf());
+        res = _field_array[i]->cmp(left, right);
+        if (res != 0) {
             return res;
         }
     }
@@ -246,11 +272,6 @@ int RowCursor::full_key_cmp(const RowCursor& other) const {
 }
 
 int RowCursor::cmp(const RowCursor& other) const {
-    if (!_is_inited) {
-        OLAP_LOG_FATAL("row curosr is not inited.");
-        return -1;
-    }
-
     int res = 0;
     // 两个cursor有å¯èƒ½field个数ä¸åŒï¼Œåªæ¯”较共åŒéƒ¨åˆ†
     size_t common_prefix_count = min(_key_column_num, other._key_column_num);
@@ -260,7 +281,10 @@ int RowCursor::cmp(const RowCursor& other) const {
             continue;
         }
 
-        if (0 != (res = _field_array[i]->cmp(other._field_array[i]))) {
+        char* left = _field_array[i]->get_field_ptr(_fixed_buf);
+        char* right = other._field_array[i]->get_field_ptr(other.get_buf());
+        res = _field_array[i]->cmp(left, right);
+        if (res != 0) {
             return res;
         }
     }
@@ -269,21 +293,18 @@ int RowCursor::cmp(const RowCursor& other) const {
 }
 
 int RowCursor::index_cmp(const RowCursor& other) const {
-    if (!_is_inited) {
-        OLAP_LOG_FATAL("row curosr is not inited.");
-        return -1;
-    }
-
     int res = 0;
     // 两个cursor有å¯èƒ½field个数ä¸åŒï¼Œåªæ¯”较共åŒéƒ¨åˆ†
-    size_t common_prefix_count = min(_columns_size, other._key_column_num);
+    size_t common_prefix_count = min(_columns.size(), other._key_column_num);
     // åªæœ‰key columnæ‰ä¼šå‚与比较
     for (size_t i = 0; i < common_prefix_count; ++i) {
         if (_field_array[i] == NULL || other._field_array[i] == NULL) {
             continue;
         }
-
-        if (0 != (res = _field_array[i]->index_cmp(other._field_array[i]))) {
+        char* left = _field_array[i]->get_field_ptr(_fixed_buf);
+        char* right = other._field_array[i]->get_field_ptr(other.get_buf());
+        res = _field_array[i]->index_cmp(left, right);
+        if (res != 0) {
             return res;
         }
     }
@@ -292,19 +313,15 @@ int RowCursor::index_cmp(const RowCursor& other) const {
 }
 
 bool RowCursor::equal(const RowCursor& other) const {
-    if (!_is_inited) {
-        OLAP_LOG_FATAL("row curosr is not inited.");
-        return false;
-    }
-
     // 按field顺åºä»ŽåŽå¾€å‰æ¯”较,有利于尽快å‘现ä¸åŒï¼Œæå‡æ¯”较性能
     size_t common_prefix_count = min(_key_column_num, other._key_column_num);
     for (int i = common_prefix_count - 1; i >= 0; --i) {
         if (_field_array[i] == NULL || other._field_array[i] == NULL) {
             continue;
         }
-
-        if (false == _field_array[i]->equal(other._field_array[i])) {
+        char* left = _field_array[i]->get_field_ptr(_fixed_buf);
+        char* right = other._field_array[i]->get_field_ptr(other.get_buf());
+        if (!_field_array[i]->equal(left, right)) {
             return false;
         }
     }
@@ -312,323 +329,64 @@ bool RowCursor::equal(const RowCursor& other) const {
 }
 
 void RowCursor::finalize_one_merge() {
-    
-    for (size_t i = _key_column_num; i < _field_array_size; ++i) {
+    for (size_t i = _key_column_num; i < _field_array.size(); ++i) {
         if (_field_array[i] == NULL) {
             continue;
         }
-        if (_field_array[i]->get_aggregation_method() == OLAP_FIELD_AGGREGATION_HLL_UNION) {
-            Field* field = _field_array[i];
-            field->finalize_one_merge();
-        }       
+        char* dest = _field_array[i]->get_ptr(_fixed_buf);
+        _field_array[i]->finalize(dest);
     }
 }
 
-OLAPStatus RowCursor::aggregate(const RowCursor& other) {
-    CHECK_ROWCURSOR_INIT();
-    if (_field_array_size != other._field_array_size) {
-        OLAP_LOG_WARNING("Fail to do aggregate; the two rowcursors do not match."
-                         "[_field_array_size=%lu; other._field_array_size=%lu; "
-                         "_key_column_num=%lu; other._key_column_num=%lu]",
-                         _field_array_size,
-                         other._field_array_size,
-                         _key_column_num,
-                         other._key_column_num);
-        
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-
+void RowCursor::aggregate(const RowCursor& other) {
     // åªæœ‰value columnæ‰ä¼šå‚与aggregate
-    for (size_t i = _key_column_num; i < _field_array_size; ++i) {
+    for (size_t i = _key_column_num; i < _field_array.size(); ++i) {
         if (_field_array[i] == NULL || other._field_array[i] == NULL) {
             continue;
         }
 
-        switch (_field_array[i]->get_aggregation_method()) {
-        case OLAP_FIELD_AGGREGATION_MIN:
-            if (true == is_null(i) && true == other.is_null(i)) {
-                break;
-            } else if (false == is_null(i) && true == other.is_null(i)) {
-                set_null(i);
-                _field_array[i]->copy(other._field_array[i]);
-                _field_length_array[i] = _field_array[i]->field_size();
-                break;
-            } else if (true == is_null(i) && false == other.is_null(i)) {
-                break;
-            } else {
-                _field_array[i]->aggregate(other._field_array[i]);
-                _field_length_array[i] = _field_array[i]->field_size();
-            }
-            break;
-        case OLAP_FIELD_AGGREGATION_MAX:
-        case OLAP_FIELD_AGGREGATION_SUM:
-        case OLAP_FIELD_AGGREGATION_HLL_UNION:
-            if (true == is_null(i) && true == other.is_null(i)) {
-                break;
-            } else if (false == is_null(i) && true == other.is_null(i)) {
-                break;
-            } else if (true == is_null(i) && false == other.is_null(i)) {
-                set_not_null(i);
-                _field_array[i]->copy(other._field_array[i]);
-                _field_length_array[i] = _field_array[i]->field_size();
-            } else {
-                _field_array[i]->aggregate(other._field_array[i]);
-                _field_length_array[i] = _field_array[i]->field_size();
-            }
-            break;
-        case OLAP_FIELD_AGGREGATION_REPLACE:
-            if (true == is_null(i) && true == other.is_null(i)) {
-                break;
-            } else if (false == is_null(i) && true == other.is_null(i)) {
-                set_null(i);
-            } else if (true == is_null(i) && false == other.is_null(i)) {
-                set_not_null(i);
-            }
-            _field_array[i]->aggregate(other._field_array[i]);
-            _field_length_array[i] = _field_array[i]->field_size();
-            break;
-         case OLAP_FIELD_AGGREGATION_NONE:
-         case OLAP_FIELD_AGGREGATION_UNKNOWN:
-         default:
-            break;
-        }
+        char* dest = _field_array[i]->get_field_ptr(_fixed_buf);
+        char* src = other._field_array[i]->get_field_ptr(other.get_buf());
+        _field_array[i]->aggregate(dest, src);
     }
-
-    return OLAP_SUCCESS;
-}
-
-OLAPStatus RowCursor::_write(char* buf, StorageFormatEnum format) const {
-#ifndef PERFORMANCE
-    CHECK_ROWCURSOR_INIT();
-
-    if (buf == NULL) {
-        OLAP_LOG_WARNING("input pointer is NULL. [buf=%p]", buf);
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-#endif
-    size_t offset = 0;
-    for (size_t i = 0; i < _columns_size; ++i) {
-        size_t column_id = _columns[i];
-        size_t field_size = 0;
-
-        if (LOCAL_STORAGE_FORMAT == format) {
-            field_size = _field_array[column_id]->field_size();
-            _field_array[column_id]->to_storage(buf + offset);
-        } else {
-            field_size = _field_array[column_id]->field_size();
-            _field_array[column_id]->to_mysql(buf + offset);
-        }
-        offset += field_size;
-    }
-
-    return OLAP_SUCCESS;
-}
-
-OLAPStatus RowCursor::write(char* buf) const {
-    return _write(buf, LOCAL_STORAGE_FORMAT);
-}
-
-OLAPStatus RowCursor::write_mysql(char* buf) const {
-    return _write(buf, MYSQL_FORMAT);
-}
-
-OLAPStatus RowCursor::write_by_indices_mysql(const vector& indices,
-                                         char* buf,
-                                         size_t buf_size,
-                                         size_t* written_size) const {
-    CHECK_ROWCURSOR_INIT();
-
-    if (buf == NULL || buf_size == 0) {
-        OLAP_LOG_WARNING("input params are invalid. [indices_size=%ld; buf=%p; buf_size=%lu]",
-                         indices.size(),
-                         buf,
-                         buf_size);
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-    for (size_t i = 0; i < indices.size(); ++i) {
-        if (indices[i] > _field_array_size - 1) {
-            OLAP_LOG_WARNING("input indices are invalid."
-                             "[index=%lu; index_value=%d; column_size=%lu]",
-                             i,
-                             indices[i],
-                             _field_array_size);
-            return OLAP_ERR_INPUT_PARAMETER_ERROR;
-        }
-    }
-
-    size_t total_size = 0;
-    for (size_t i = 0; i < indices.size(); ++i) {
-        total_size += _field_array[i]->size();
-    }
-    if (total_size > buf_size) {
-        OLAP_LOG_WARNING("write buffer is not enough. [need_size=%ld; buf_size=%ld]",
-                         total_size,
-                         buf_size);
-        return OLAP_ERR_BUFFER_OVERFLOW;
-    }
-
-    char *curr_buf = buf;
-    for (size_t i = 0, size = indices.size(); i < size; ++i) {
-        size_t current_index = indices[i];
-        _field_array[current_index]->to_mysql(curr_buf);
-        curr_buf += _field_array[current_index]->size();
-    }
-
-    if (written_size) {
-        *written_size = curr_buf - buf;
-    }
-        
-    return OLAP_SUCCESS;
-}
-
-OLAPStatus RowCursor::read(const char* buf, size_t max_buf_len) {
-    CHECK_ROWCURSOR_INIT();
-
-    if (buf == NULL) {
-        OLAP_LOG_WARNING("input pointer is NULL. [buf=%p]", buf);
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-
-    size_t offset = 0;
-    size_t length = 0;
-    
-    for (size_t i = 0; i < _columns_size; ++i) {
-        _field_array[_columns[i]]->from_storage(buf + offset);
-        length = _field_array[_columns[i]]->field_size();
-        offset += length;
-        _field_length_array[_columns[i]] = length;
-
-        if (offset > max_buf_len) {
-            OLAP_LOG_WARNING("buffer overflow. [max_buf_len=%lu offset=%lu]", max_buf_len, offset);
-
-            return OLAP_ERR_BUFFER_OVERFLOW;
-        }
-    }
-
-    return OLAP_SUCCESS;
-}
-
-
-OLAPStatus RowCursor::read_field(const char* buf, size_t index, size_t field_size) {
-    CHECK_ROWCURSOR_INIT();
-
-    if (buf == NULL) {
-        OLAP_LOG_WARNING("input pointer is NULL. [buf=%p]", buf);
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-    _field_array[index]->from_storage(buf);
-    _field_length_array[index] = _field_array[index]->field_size();
-    return OLAP_SUCCESS;
 }
 
 OLAPStatus RowCursor::build_max_key() {
-    CHECK_ROWCURSOR_INIT();
-
-    for (uint32_t i = 0; i < _columns_size; ++i) {
-        _field_array[_columns[i]]->set_to_max();
-        _field_length_array[_columns[i]] = _field_array[_columns[i]]->field_size();
+    for (auto cid : _columns) {
+        Field* field = _field_array[cid];
+        char* dest = field->get_ptr(_fixed_buf);
+        field->set_to_max(dest);
     }
-    
     return OLAP_SUCCESS;
 }
 
 OLAPStatus RowCursor::build_min_key() {
-    CHECK_ROWCURSOR_INIT();
-
-    for (uint32_t i = 0; i < _columns_size; ++i) {
-        _field_array[_columns[i]]->set_to_min();
-        _field_length_array[_columns[i]] = _field_array[_columns[i]]->field_size();
+    for (auto cid : _columns) {
+        Field* field = _field_array[cid];
+        char* dest = field->get_ptr(_fixed_buf);
+        field->set_to_min(dest);
     }
 
     return OLAP_SUCCESS;
 }
 
-OLAPStatus RowCursor::read_by_index(size_t index, const char* buf) {
-    CHECK_ROWCURSOR_INIT();
-
-    if (index >= _field_array_size) {
-        OLAP_LOG_WARNING("index exceeds the max. [index=%lu; max_index=%lu]",
-                         index,
-                         _field_array_size);
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-
-    _field_array[index]->from_storage(buf);
-    _field_length_array[index] = _field_array[index]->field_size();
-    
-    return OLAP_SUCCESS;
-}
-
-OLAPStatus RowCursor::read_by_index(size_t index, const char* buf, int length) {
-    CHECK_ROWCURSOR_INIT();
-
-    if (index >= _field_array_size) {
-        OLAP_LOG_WARNING("index exceeds the max. [index=%lu; max_index=%lu]",
-                         index,
-                         _field_array_size);
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-
-    reinterpret_cast(_field_array[index])->from_storage_length(buf, length);
-    _field_length_array[index] = _field_array[index]->field_size();
-    
-    return OLAP_SUCCESS;
-}
-
-OLAPStatus RowCursor::write_by_index(size_t index, char* buf) const {
-    CHECK_ROWCURSOR_INIT();
-    
-    if (index >= _field_array_size) {
-        OLAP_LOG_WARNING("index exceeds the max. [index=%lu; max_index=%lu]",
-                         index,
-                         _field_array_size);
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-    
-    if (_field_array[index]->is_null()) {
-        buf[0] |= 1;
-    } else {
-        buf[0] &= ~1;
-    }
-    _field_array[index]->to_storage(buf + sizeof(char));
-
-    return OLAP_SUCCESS;
-}
-
-OLAPStatus RowCursor::write_index_by_index(size_t index, char* index_buf) const {
-    CHECK_ROWCURSOR_INIT();
-    
-    if (index >= _field_array_size) {
-        OLAP_LOG_WARNING("index exceeds the max. [index=%lu; max_index=%lu]",
-                         index,
-                         _field_array_size);
-        
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-   
-    _field_array[index]->to_index(index_buf);
-
-    return OLAP_SUCCESS;
-}
-
 OLAPStatus RowCursor::from_string(const vector& val_string_array) {
-    CHECK_ROWCURSOR_INIT();
-    
-    if (val_string_array.size() != _columns_size) {
+    if (val_string_array.size() != _columns.size()) {
         OLAP_LOG_WARNING("column count does not match. [string_array_size=%lu; field_count=%lu]",
                          val_string_array.size(),
-                         _field_array_size);
+                         _field_array.size());
         return OLAP_ERR_INPUT_PARAMETER_ERROR;
     }
 
     for (size_t i = 0; i < val_string_array.size(); ++i) {
-        OLAPStatus res = _field_array[_columns[i]]->from_string(val_string_array[i].c_str());
+        Field* field = _field_array[_columns[i]];
+        char* buf = field->get_ptr(_fixed_buf);
+        OLAPStatus res = field->from_string(buf, val_string_array[i]);
         if (OLAP_SUCCESS != res) {
             OLAP_LOG_WARNING("Fail to convert field from string.[val_string=%s res=%d]", 
                     val_string_array[i].c_str(), res);
             return res;
         }
-        _field_length_array[_columns[i]] = _field_array[_columns[i]]->field_size();
     }
 
     return OLAP_SUCCESS;
@@ -637,9 +395,11 @@ OLAPStatus RowCursor::from_string(const vector& val_string_array) {
 std::vector RowCursor::to_string_vector() const {
     std::vector result;
 
-    for (size_t i = 0; i < _columns_size; ++i) {
-        if (_field_array[_columns[i]] != NULL) {
-            result.push_back(_field_array[_columns[i]]->to_string());
+    for (auto cid : _columns) {
+        if (_field_array[cid] != NULL) {
+            Field* field = _field_array[cid];
+            char* src = field->get_ptr(_fixed_buf);
+            result.push_back(field->to_string(src));
         } else {
             result.push_back("");
         }
@@ -650,41 +410,38 @@ std::vector RowCursor::to_string_vector() const {
 
 string RowCursor::to_string() const {
     string result;
-    for (size_t i = 0; i < _columns_size; ++i) {
-        if (i > 0) {
+    size_t i = 0;
+    for (auto cid : _columns) {
+        if (i++ > 0) {
             result.append("|");
         }
 
-        result.append(std::to_string(_field_array[_columns[i]]->is_null()));
+        Field* field = _field_array[cid];
+        result.append(std::to_string(field->is_null(_fixed_buf)));
         result.append("&");
-        if (_field_array[_columns[i]]->is_null()) {
+        if (field->is_null(_fixed_buf)) {
             result.append("NULL");
         } else {
-            result.append(_field_array[_columns[i]]->to_string());
+            char* src = field->get_ptr(_fixed_buf);
+            result.append(field->to_string(src));
         }
     }
 
     return result;
 }
 
-bool RowCursor::is_null(size_t index) const {
-    return _field_array[index]->is_null();
-}
-
-bool RowCursor::is_null_converted(size_t index) const {
-    size_t column_id = _columns[index];
-    return _field_array[column_id]->is_null();
-}
-
 string RowCursor::to_string(string sep) const {
     string result;
-    for (size_t i = 0; i < _columns_size; ++i) {
-        if (i > 0) {
+    size_t i = 0;
+    for (auto cid : _columns) {
+        if (i++ > 0) {
             result.append(sep);
         }
-        
-        if (_field_array[_columns[i]] != NULL) {
-            result.append(_field_array[_columns[i]]->to_string());
+
+        Field* field = _field_array[cid];
+        if (field != NULL) {
+            char* src = field->get_ptr(_fixed_buf);
+            result.append(field->to_string(src));
         } else {
             result.append("NULL");
         }
@@ -695,25 +452,25 @@ string RowCursor::to_string(string sep) const {
 
 OLAPStatus RowCursor::get_first_different_column_id(const RowCursor& other,
                                                 size_t* first_diff_id) const {
-    CHECK_ROWCURSOR_INIT();
-    
     if (first_diff_id == NULL) {
         OLAP_LOG_WARNING("input parameter 'first_diff_id' is NULL.");
         return OLAP_ERR_INPUT_PARAMETER_ERROR;
     }
 
-    if (_columns_size != other.field_count()) {
+    if (_columns.size() != other.field_count()) {
         OLAP_LOG_WARNING("column number of two cursors do not match.");
         return OLAP_ERR_INPUT_PARAMETER_ERROR;
     }
 
     size_t i = 0;
-    for (; i < _field_array_size; ++i) {
+    for (; i < _field_array.size(); ++i) {
         if (_field_array[i] == NULL || other._field_array[i] == NULL) {
             continue;
         }
-            
-        if (_field_array[i]->cmp(other._field_array[i]) != 0) {
+
+        char* left = _field_array[i]->get_field_ptr(_fixed_buf);
+        char* right = other._field_array[i]->get_field_ptr(other.get_buf());
+        if (0 != (_field_array[i]->cmp(left, right))) {
             break;
         }
     }
diff --git a/be/src/olap/row_cursor.h b/be/src/olap/row_cursor.h
index 8abf31d982..393e45a8b9 100644
--- a/be/src/olap/row_cursor.h
+++ b/be/src/olap/row_cursor.h
@@ -23,42 +23,62 @@
 #include "olap/olap_common.h"
 #include "olap/olap_define.h"
 
-#define CHECK_ROWCURSOR_INIT() \
-    if (!_is_inited) {\
-        OLAP_LOG_FATAL("row cursor is not inited.");\
-        return OLAP_ERR_NOT_INITED;\
-    }
-
 namespace palo {
 class Field;
 
 // 代ç†ä¸€è¡Œæ•°æ®çš„æ“ä½œ
 class RowCursor {
 public:
+    static inline bool equal(const std::vector& ids,
+                             const RowCursor* lhs, const RowCursor* rhs) {
+        for (auto id : ids) {
+            char* left = lhs->get_field_ptr(id);
+            char* right = rhs->get_field_ptr(id);
+            if (!lhs->_field_array[id]->equal(left, right)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    static inline void aggregate(const std::vector& cids,
+                                 RowCursor* lhs, const RowCursor* rhs) {
+        // åªæœ‰value columnæ‰ä¼šå‚与aggregate
+        for (auto cid : cids) {
+            char* dest = lhs->get_field_ptr(cid);
+            char* src = rhs->get_field_ptr(cid);
+            lhs->_field_array[cid]->aggregate(dest, src);
+        }
+    }
+
     RowCursor();
     
     // é历销æ¯field指针
     ~RowCursor();
     
-    // æ ¹æ®ä¼ å…¥çš„FieldInfo vector,创建所有的field对象
+    // æ ¹æ®ä¼ å…¥schema的创建RowCursor
     OLAPStatus init(const std::vector& tablet_schema);
 
     // æ ¹æ®ä¼ å…¥schemaçš„å‰n列创建RowCursor
-    OLAPStatus init(const std::vector& tablet_schema, size_t column_count);
+    OLAPStatus init(const std::vector& tablet_schema,
+                    size_t column_count);
 
     // æ ¹æ®ä¼ å…¥schemaå’Œcolumn id list创建RowCursor,
     // 用于计算过程åªä½¿ç”¨éƒ¨åˆ†éžå‰ç¼€è¿žç»­åˆ—的场景
-    OLAPStatus init(
-            const std::vector& tablet_schema, const std::vector& columns);
+    OLAPStatus init(const std::vector& tablet_schema,
+                    const std::vector& columns);
 
     // 用传入的keyçš„sizeæ¥åˆå§‹åŒ–
     // ç›®å‰ä»…用在拆分key区间的时候
-    OLAPStatus init_keys(const std::vector& tablet_schema, 
-                    const std::vector& keys);
-  
-    // åŒä¸Šï¼Œç›´æŽ¥ä¼ keysçš„length
-    OLAPStatus init_keys(const std::vector& tablet_schema, 
-                    const std::vector& field_lengths);
+    OLAPStatus init_scan_key(const std::vector& tablet_schema,
+                             const std::vector& keys);
+
+    OLAPStatus init_scan_key(const std::vector& tablet_schema,
+                             const std::vector& field_lengths);
+
+    //allocate memory for string type, which include char, varchar, hyperloglog
+    OLAPStatus allocate_memory_for_string_type(const std::vector& tablet_schema,
+                                               MemPool* mem_pool = nullptr);
  
     // 两个RowCurosråšæ¯”较,返回-1,0,1
     int cmp(const RowCursor& other) const;
@@ -73,75 +93,38 @@ public:
     bool equal(const RowCursor& other) const;
 
     // 两个RowCursoråšç´¯åŠ ï¼Œç»“æžœæ˜¯this += other
-    OLAPStatus aggregate(const RowCursor& other);
+    void aggregate(const RowCursor& other);
 
     // now only used by hll column, do aggregating
     void finalize_one_merge();
-
-    // 当å‰RowCursor内的数æ®ä»¥storageæ ¼å¼é€ä¸ªfield连续输出到一段buf中
-    OLAPStatus write(char* buf) const;
-
-    // 当å‰RowCursor内的数æ®ä»¥mysqlæ ¼å¼é€ä¸ªfield连续输出到一段buf中
-    OLAPStatus write_mysql(char* buf) const;
-
-    // RowCursor从一段连续的bufä¸­è¯»å–æ•°æ®ï¼Œ
-    OLAPStatus read(const char* buf, size_t max_buf_len);
-    // read a field to RowCursor
-    OLAPStatus read_field(const char* buf, size_t index, size_t field_size);
+    inline void finalize_one_merge(const std::vector& ids);
 
     // RowCursor attach到一段连续的buf
-    inline OLAPStatus attach(char* buf, size_t max_buf_len);
-
-    // 输出一列的值到buf
-    OLAPStatus write_by_index(size_t index, char* buf) const;
+    inline void attach(char* buf) { _fixed_buf = buf; }
 
     // 输出一列的index到buf
-    OLAPStatus write_index_by_index(size_t index, char* buf) const;
+    void write_index_by_index(size_t index, char* index_ptr) const {
+        char* src = _field_array[index]->get_field_ptr(_fixed_buf);
+        _field_array[index]->to_index(index_ptr, src);
+    }
 
-    // 按列åºå·è¾“出field的内容,传入vector,一次输出多列
-    OLAPStatus write_by_indices_mysql(const std::vector& indices,
-                                  char* buf,
-                                  size_t buf_size,
-                                  size_t* written_size) const;
+    // set field content without nullbyte
+    void set_field_content(size_t index, const char* buf, MemPool* mem_pool) {
+        char* dest = _field_array[index]->get_ptr(_fixed_buf);
+        _field_array[index]->copy_content(dest, buf, mem_pool);
+    }
 
-    // 把所有Field的格å¼å˜ä¸ºMySQLæ ¼å¼
-    inline void to_mysql();
-
-    // ç›´æŽ¥è¯»å–æŸä¸€ä½ç½®ä¸Šçš„field的内容
-    OLAPStatus read_by_index(size_t index, const char* buf);
-    OLAPStatus read_by_index(size_t index, const char* buf, int length);
-
-    // 直接attachæŸä¸€ä½ç½®ä¸Šçš„field的内容
-    inline OLAPStatus attach_by_index(size_t index, char* buf, bool field_by_buf);
-
-    inline OLAPStatus set_null(size_t index);
-    inline OLAPStatus set_not_null(size_t index);
+    inline void set_null(size_t index) { _field_array[index]->set_null(_fixed_buf); }
+    inline void set_not_null(size_t index) { _field_array[index]->set_not_null(_fixed_buf); } 
 
     // 从传入的字符串数组ååºåˆ—化内部å„field的值
     // æ¯ä¸ªå­—符串必须是一个\0结尾的字符串
     // è¦æ±‚输入字符串和row cursor有相åŒçš„列数,
     OLAPStatus from_string(const std::vector& val_string_arr);
 
-    // å„field存储长度之和
-    size_t length() const {
-        size_t length = 0;
-        for (size_t i = 0; i < _field_array_size; i++) {
-            if (_field_array[i] != NULL) {
-                length += _field_length_array[i];
-            }
-        }
-
-        return length;
-    }
-
     // 返回当å‰row cursor中列的个数
     size_t field_count() const {
-        return _columns_size;
-    }
-
-    // 当å‰Schema是å¦ä¸ŽMySQLæ ¼å¼å…¼å®¹, 以此判断是å¦éœ€è¦æ ¼å¼è½¬æ¢
-    bool is_mysql_compatible() const {
-        return _is_mysql_compatible;
+        return _columns.size();
     }
 
     // 以stringæ ¼å¼è¾“出rowcursor内容,仅供logåŠdebug使用
@@ -150,274 +133,116 @@ public:
     std::vector to_string_vector() const;
 
     // 从å¦å¤–一个RowCursorå¤åˆ¶å®Œæ•´çš„内容,需è¦ä¸¤ä¸ªcursor在字段长度和类型上完全匹é…
-    inline OLAPStatus copy(const RowCursor& other);
-    inline OLAPStatus attach_and_copy(char* buf, const RowCursor& other);
+    inline OLAPStatus copy(const RowCursor& other, MemPool* mem_pool);
+    inline OLAPStatus copy_without_pool(const RowCursor& other);
+    inline OLAPStatus agg_init(const RowCursor& other);
 
     // 比较两个cursor,获å–第一个值ä¸åŒçš„columnçš„id,用于selectivity的计算当中
     OLAPStatus get_first_different_column_id(const RowCursor& other, size_t* first_diff_id) const;
 
     const Field* get_field_by_index(size_t index) const {
-        if (false == _is_inited || index >= _field_array_size) {
-            return NULL;
-        }
-
         return _field_array[index];
     }
 
     bool is_min(size_t index) {
-        return _field_array[index]->is_min();
+        Field* field = _field_array[index];
+        char* src = field->get_ptr(_fixed_buf);
+        return field->is_min(src);
     }
 
-    const size_t get_field_size(size_t index) const {
-        if (false == _is_inited || index >= _field_array_size) {
-            return 0;
-        }
-        
-        return _field_array[index]->field_size();
+    const size_t get_index_size(size_t index) const {
+        return _field_array[index]->index_size();
     }
 
-    Field* get_mutable_field_by_index(size_t index) const {
-        if (false == _is_inited || index >= _field_array_size) {
-            return NULL;
-        }
-
-        if (_field_array[index]->is_null()) {
-            return NULL;
-        }
-
-        return _field_array[index];
-    }
-
-    // 在attach的内存ä½ç½®ç”Ÿæˆæœ€å¤§/最å°key
+    // set max/min for key field in _field_array
     OLAPStatus build_max_key();
     OLAPStatus build_min_key();
 
-    inline OLAPStatus rearrange();
+    inline char* get_buf() const { return _fixed_buf; }
 
-    inline const char* get_buf() const {
-        return _buf;
-    }   
+    // this two functions is used in unit test
+    inline size_t get_fixed_len() const { return _fixed_len; }
+    inline size_t get_variable_len() const { return _variable_len; }
 
-    bool is_null(size_t i) const; //直接给定列在表中的åç§»
-    bool is_null_converted(size_t i) const;//给定查询中的å移,需è¦è½¬æ¢ä¸ºå®žé™…åç§»
-
-    // 用于返回row_cursor实际使用字段的长度之和,å˜é•¿ç±»åž‹æŒ‰æœ€å¤§é•¿åº¦è®¡ç®—
-    // 比如表有10个字段,但查询仅涉åŠå…¶ä¸­5个字段,则返回值为5字段长度之和
-    inline size_t get_buf_len() const {
-        return _length;
-    }    
-
-    // 用于清空row_cursor内部buf
-    inline void reset_buf() {
-        memset(_buf, 0, _length);
+    bool is_null(size_t index) const {
+        return _field_array[index]->is_null(_fixed_buf);
     }
 
-    
-    void get_field_buf_lengths(std::vector* field_lengths) const{
-        if (nullptr == field_lengths) {
-            return;
-        }
-        
-        for (int i = 0; i < _columns_size; i++) {
-            field_lengths->push_back(_field_array[i]->get_buf_size());
-        }
-    }
+    char* get_field_ptr(uint32_t cid) const { return _fixed_buf + _field_offsets[cid]; }
+    char* get_field_content_ptr(uint32_t cid) const { return _fixed_buf + _field_offsets[cid] + 1; }
 
+    inline uint32_t hash_code(uint32_t seed) const;
 private:
-    // 实际的åˆå§‹åŒ–函数
-    OLAPStatus _init(const std::vector& tablet_schema, const std::vector& columns, 
-                      const std::vector* field_lengths);
+    // common init function
+    OLAPStatus _init(const std::vector& tablet_schema,
+                     const std::vector& columns);
 
-    typedef Field** field_array_t;
+    std::vector _field_array;    // store point array of field
+    std::vector _field_offsets;  // field offset in _fixed_buf
 
-    enum StorageFormatEnum {
-        LOCAL_STORAGE_FORMAT = 0,
-        MYSQL_FORMAT = 1,
-    };
+    size_t _key_column_num;              // key num in row_cursor
 
-    OLAPStatus _write(char* buf, StorageFormatEnum format) const;
+    std::vector _columns;      // column_id in schema
+    char* _fixed_buf = nullptr;          // point to fixed buf
+    size_t _fixed_len;
+    char* _owned_fixed_buf = nullptr;    // point to buf allocated in init function
 
-    field_array_t _field_array;        // 内部ä¿å­˜field指针的数组
-    size_t _null_byte_num;
-    size_t _field_array_size;          // field指针数组的长度
-    uint32_t* _columns;                // 部分列æ“作模å¼ä¸‹çš„列åºå·æ•°ç»„
-    size_t _columns_size;
-    size_t _key_column_num;            // 一行中å‰å¤šå°‘个column是key
-    size_t _length;
-    size_t _length_mysql;
-    size_t* _field_length_array;       // 记录æ¯ä¸€ä¸ªfieldçš„storageæ ¼å¼é•¿åº¦
-    size_t* _field_offset;
-    bool _is_inited;                   // åˆå§‹åŒ–标记
-    char* _buf;                        // Field使用的buf
-    bool _is_mysql_compatible;
+    char* _variable_buf = nullptr;
+    size_t _variable_len;
+    bool _variable_buf_allocated_by_pool;
+    std::vector hll_contexts;
 
     DISALLOW_COPY_AND_ASSIGN(RowCursor);
 };
 
-inline OLAPStatus RowCursor::attach(char* buf, size_t max_buf_len) {
-#ifndef PERFORMANCE
-    CHECK_ROWCURSOR_INIT();
-
-    if (buf == NULL) {
-        OLAP_LOG_WARNING("input pointer is NULL. [buf=%p]", buf);
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-
-#endif
-    
-    size_t offset = 0;
-    size_t length = 0;
-    for (size_t i = 0; i < _columns_size; ++i) {
-        _field_array[_columns[i]]->attach_field(buf + offset);
-        length = _field_array[_columns[i]]->field_size();
-        offset += length;
-        _field_length_array[_columns[i]] = length;
-#ifndef PERFORMANCE
-        if (offset > max_buf_len) {
-            OLAP_LOG_WARNING("buffer overflow. [max_buf_len=%lu offset=%lu]", max_buf_len, offset);
-            return OLAP_ERR_BUFFER_OVERFLOW;
-        }
-
-#endif
-    }
-
-    return OLAP_SUCCESS;
-}
-
-void RowCursor::to_mysql() {
-    for (size_t i = 0; i < _columns_size; ++i) {
-        _field_array[_columns[i]]->to_mysql();
-    }
-}
-
-inline OLAPStatus RowCursor::set_null(size_t index) {
-    _field_array[index]->set_null();
-    return OLAP_SUCCESS;
-}
-
-inline OLAPStatus RowCursor::set_not_null(size_t index) {
-    _field_array[index]->set_not_null();
-    return OLAP_SUCCESS;
-}
-
-inline OLAPStatus RowCursor::attach_by_index(size_t index, char* buf, bool field_or_buf) {
-#ifndef PERFORMANCE
-    CHECK_ROWCURSOR_INIT();
-
-    if (index >= _field_array_size) {
-        OLAP_LOG_WARNING("index exceeds the max. [index=%lu max_index=%lu]",
-                         index,
-                         _field_array_size);
-        return OLAP_ERR_INPUT_PARAMETER_ERROR;
-    }
-
-#endif
-
-    if (true == field_or_buf) {
-        _field_array[index]->attach_field(buf);
-    } else {
-        _field_array[index]->attach_buf(buf);
-    }
-    _field_length_array[index] = _field_array[index]->field_size();
-
-    return OLAP_SUCCESS;
-}
-
 // 主è¦ç”¨äºŽmerge
-inline OLAPStatus RowCursor::copy(const RowCursor& other) {
-    CHECK_ROWCURSOR_INIT();
-    reset_buf();
-
-    for (size_t i = 0; i < _columns_size; ++i) {
-        uint32_t column_id = _columns[i];
-#ifndef PERFORMANCE
-        if (column_id >= other._field_array_size || other._field_array[column_id] == NULL) {
-            OLAP_LOG_WARNING("two cursor not match.");
-            return OLAP_ERR_INPUT_PARAMETER_ERROR;
-        }
-
-#endif
-        _field_array[column_id]->copy(other._field_array[column_id]);
-        // 用于merge,merge的时候其实是ä¸éœ€è¦è€ƒè™‘行长,因为写数æ®ä¸æŒ‰è¡Œé•¿å†™ã€‚æ›´æ–°ä¸æ›´æ–°éƒ½è¡Œ
-        _field_length_array[column_id] = other._field_array[column_id]->field_size();
+inline OLAPStatus RowCursor::copy(const RowCursor& other, MemPool* mem_pool) {
+    for (auto cid : _columns) {
+        Field* field = _field_array[cid];
+        char* dest = get_field_ptr(cid);
+        char* src = other.get_field_ptr(cid);
+        field->copy_with_pool(dest, src, mem_pool);
     }
 
     return OLAP_SUCCESS;
 }
 
-// 从otheræ‹·è´åˆ°buf中。此函数仅用在读数æ®çš„æ—¶å€™ï¼Œæ­¤å¤„存在一个问题,
-// 当varchar字段èšåˆæ—¶ï¼Œsrcå¯èƒ½æ¯”dst长,所以在copy的时候,ä¸èƒ½ç”¨è‡´å¯†çš„æŽ’列
-// æ–¹å¼ã€‚åªèƒ½æ‹·è´å®Œä¹‹åŽå†æ•´ç†å†…å­˜
-inline OLAPStatus RowCursor::attach_and_copy(char* buf, const RowCursor& other) {
-    CHECK_ROWCURSOR_INIT();
-
-    size_t offset = 0;
-
-    for (size_t i = 0; i < _columns_size; ++i) {
-#ifndef PERFORMANCE
-        if (_field_array[_columns[i]] == NULL
-                || _columns[i] >= other._field_array_size
-                || other._field_array[_columns[i]] == NULL) {
-            return OLAP_ERR_INPUT_PARAMETER_ERROR;
-        }
-
-#endif
-        size_t field_size = other._field_array[_columns[i]]->field_size();
-
-        // XXX(fulili) 为了实现快速copyï¼Œéœ€è¦æŒ‡å‘çš„buf至少有é¢å¤–8字节的空间
-        if (OLAP_LIKELY(field_size <= 8)) {
-            memcpy(buf + offset, other._field_array[_columns[i]]->get_null(), 8);
-        } else {
-            memcpy(buf + offset, other._field_array[_columns[i]]->get_null(), field_size);
-        }
-
-        _field_array[_columns[i]]->attach_field(buf + offset);
-        // æ›´æ–°ä¸€ä¸‹è¡Œé•¿ï¼Œæ¯æ¬¡attach或是from_storage之åŽéƒ½éœ€è¦æ›´æ–°
-        _field_length_array[_columns[i]] = field_size;
-        //这里的offset 是最大值,此函数åªç”¨äºŽæŸ¥è¯¢ï¼Œå¾€buffer里顺åºå†™è¡Œï¼Œéœ€è¦ç®—åç§»
-        //但由于å˜é•¿å­—符串èšåˆçš„问题,所以ä¸ç”¨å®žé™…长度了,èšåˆåŽè°ƒç”¨å‡½æ•°ç´§å‡‘下,
-        //ä¸»è¦æ˜¯é˜²æ­¢åœ¨å†™å…¥åˆ°è°ƒç”¨å‡½æ•°é‡æ‹å†…存之间,会得到错误的长度
-        //所以这里使用offset。
-        offset += _field_offset[_columns[i]];
+inline OLAPStatus RowCursor::copy_without_pool(const RowCursor& other) {
+    for (auto cid : _columns) {
+        Field* field = _field_array[cid];
+        char* dest = get_field_ptr(cid);
+        char* src = other.get_field_ptr(cid);
+        field->copy_without_pool(dest, src);
     }
 
     return OLAP_SUCCESS;
 }
 
-inline OLAPStatus RowCursor::rearrange() {
-    size_t dst_offset = 0;
-    size_t src_offset = 0;
-    size_t field_length = 0;
-
-    for (size_t i = 0; i < _columns_size; ++i) {
-        field_length = _field_array[_columns[i]]->field_size();
-
-        if (dst_offset != src_offset) {
-            // dst_offset - src_offset结果为0或负数,所以此æ“作是将bufferå‰ç§»
-            memmove(_field_array[_columns[i]]->get_null() + (dst_offset - src_offset),
-                    _field_array[_columns[i]]->get_null(),
-                    field_length);
-        }
-
-        // offset 数组中ä¿å­˜çš„æ˜¯æœ€å¤§å移,且ä¸ä¼šå˜
-        // field_lengthé•¿åº¦åªæœ‰å˜é•¿å­—符串会ä¸åŒ
-        src_offset += _field_offset[_columns[i]];
-        dst_offset += field_length;
+inline OLAPStatus RowCursor::agg_init(const RowCursor& other) {
+    for (auto cid : _columns) {
+        Field* field = _field_array[cid];
+        char* dest = get_field_ptr(cid);
+        char* src = other.get_field_ptr(cid);
+        field->agg_init(dest, src);
     }
 
     return OLAP_SUCCESS;
 }
 
-// RowCurosr 全key进行比较
-class RowCursorComparator {
-public:
-    RowCursorComparator() {}
-    virtual ~RowCursorComparator() {}
-
-    bool operator()(const RowCursor* a, const RowCursor* b) const {
-        return a->full_key_cmp(*b) < 0;
+inline void RowCursor::finalize_one_merge(const std::vector& ids) {
+    for (uint32_t id : ids) {
+        char* dest = _field_array[id]->get_ptr(_fixed_buf);
+        _field_array[id]->finalize(dest);
     }
-};
+}
+
+inline uint32_t RowCursor::hash_code(uint32_t seed) const {
+    for (auto cid : _columns) {
+        char* dest = _field_array[cid]->get_field_ptr(_fixed_buf);
+        seed = _field_array[cid]->hash_code(dest, seed);
+    }
+    return seed;
+}
 
 }  // namespace palo
 
diff --git a/be/src/olap/schema_change.cpp b/be/src/olap/schema_change.cpp
index 9024e5f0b9..ab7643d205 100644
--- a/be/src/olap/schema_change.cpp
+++ b/be/src/olap/schema_change.cpp
@@ -21,7 +21,6 @@
 #include 
 #include 
 
-#include "olap/field.h"
 #include "olap/i_data.h"
 #include "olap/merger.h"
 #include "olap/olap_data.h"
@@ -30,6 +29,7 @@
 #include "olap/row_block.h"
 #include "olap/row_cursor.h"
 #include "olap/writer.h"
+#include "olap/wrapper_field.h"
 #include "common/resource_tls.h"
 #include "agent/cgroups_mgr.h"
 
@@ -82,62 +82,30 @@ ColumnMapping* RowBlockChanger::get_mutable_column_mapping(size_t column_index)
     return &(_schema_mapping[column_index]);
 }
 
-#define TYPE_REINTERPRET_CAST(from_type, to_type) \
+#define TYPE_REINTERPRET_CAST(FromType, ToType) \
 { \
-    char* ref_pos = ref_block._buf + ref_block._grid_items[ref_column].offset; \
-    char* new_pos = mutable_block->_buf + mutable_block->_grid_items[i].offset; \
-    for (size_t row = 0, row_num = ref_block.row_block_info().row_num; \
-            row < row_num; ++row) { \
+    size_t row_num = ref_block.row_block_info().row_num; \
+    for (size_t row = 0, mutable_row = 0; row < row_num; ++row) { \
         if (is_data_left_vec[row] != 0) { \
-            from_type* ref_offset = NULL;\
-            if (OLAP_DATA_FILE == ref_block._data_file_type) {\
-                if (false == ref_block._null_supported) {\
-                    ref_offset = reinterpret_cast( \
-                        ref_pos + row * ref_block._grid_items[ref_column].width); \
-                } else {\
-                    *new_pos = *(ref_pos + row * ref_block._grid_items[ref_column].width); \
-                    ref_offset = reinterpret_cast( \
-                        ref_pos + row * ref_block._grid_items[ref_column].width + 1); \
-                }\
-            } else if (COLUMN_ORIENTED_FILE == ref_block._data_file_type) {\
-                *new_pos = *(ref_pos + row * ref_block._grid_items[ref_column].width); \
-                ref_offset = reinterpret_cast( \
-                    ref_pos + row * ref_block._grid_items[ref_column].width + 1); \
-            }\
-            to_type *new_offset = reinterpret_cast(new_pos + 1);\
-            *(new_offset) = static_cast(*ref_offset); \
-            new_pos += mutable_block->_grid_items[i].width;\
+            char* ref_ptr = ref_block.field_ptr(row, ref_column); \
+            char* new_ptr = mutable_block->field_ptr(mutable_row++, i); \
+            *new_ptr = *ref_ptr; \
+            *(ToType*)(new_ptr + 1) = *(FromType*)(ref_ptr + 1); \
         } \
     } \
     break; \
 }
 
-#define LARGEINT_REINTERPRET_CAST(from_type, to_type) \
+#define LARGEINT_REINTERPRET_CAST(FromType, ToType) \
 { \
-    char* ref_pos = ref_block._buf + ref_block._grid_items[ref_column].offset; \
-    char* new_pos = mutable_block->_buf + mutable_block->_grid_items[i].offset; \
-    for (size_t row = 0, row_num = ref_block.row_block_info().row_num; \
-         row < row_num; ++row) { \
+    size_t row_num = ref_block.row_block_info().row_num; \
+    for (size_t row = 0, mutable_row = 0; row < row_num; ++row) { \
         if (is_data_left_vec[row] != 0) { \
-            from_type *ref_offset = NULL;\
-            if (OLAP_DATA_FILE == ref_block._data_file_type) {\
-                if (false == ref_block._null_supported) {\
-                    ref_offset = reinterpret_cast( \
-                        ref_pos + row * ref_block._grid_items[ref_column].width); \
-                } else { \
-                    *new_pos = *(ref_pos + row * ref_block._grid_items[ref_column].width); \
-                    ref_offset = reinterpret_cast( \
-                        ref_pos + row * ref_block._grid_items[ref_column].width + 1); \
-                } \
-            } else if (COLUMN_ORIENTED_FILE == ref_block._data_file_type) {\
-                *new_pos = *(ref_pos + row * ref_block._grid_items[ref_column].width); \
-                ref_offset = reinterpret_cast( \
-                    ref_pos + row * ref_block._grid_items[ref_column].width + 1); \
-            } \
-            to_type *new_offset = reinterpret_cast(new_pos + 1);\
-            to_type temp = static_cast(*ref_offset); \
-            *(new_offset) = temp; \
-            new_pos += mutable_block->_grid_items[i].width;\
+            char* ref_ptr = ref_block.field_ptr(row, ref_column); \
+            char* new_ptr = mutable_block->field_ptr(mutable_row++, i); \
+            *new_ptr = *ref_ptr; \
+            ToType new_value = *(FromType*)(ref_ptr + 1); \
+            memcpy(new_ptr + 1, &new_value, sizeof(ToType)); \
         } \
     } \
     break; \
@@ -179,7 +147,7 @@ ColumnMapping* RowBlockChanger::get_mutable_column_mapping(size_t column_index)
 #define ASSIGN_DEFAULT_VALUE(length) \
     case length: { \
         for (size_t row = 0; row < ref_block.row_block_info().row_num; ++row) { \
-            memcpy(buf, _schema_mapping[i].default_value->buf(), length); \
+            memcpy(buf, _schema_mapping[i].default_value->ptr(), length); \
             buf += length; \
         } \
         break; \
@@ -191,12 +159,9 @@ bool RowBlockChanger::change_row_block(
         int32_t data_version,
         RowBlock* mutable_block,
         uint64_t* filted_rows) const {
-    if (mutable_block == NULL || !mutable_block->_is_inited) {
+    if (mutable_block == NULL) {
         OLAP_LOG_FATAL("mutable block is uninitialized.");
         return false;
-    } else if (!ref_block._is_inited) {
-        OLAP_LOG_WARNING("the row block referenced is uninited.");
-        return false;
     } else if (mutable_block->_tablet_schema.size() != _schema_mapping.size()) {
         OLAP_LOG_WARNING("mutable block does not match with schema mapping rules. "
                          "[block_schema_size=%ld, mapping_schema_size=%ld]",
@@ -205,19 +170,14 @@ bool RowBlockChanger::change_row_block(
         return false;
     }
 
-    if (mutable_block->allocated_row_num() < ref_block.row_block_info().row_num) {
+    if (mutable_block->capacity() < ref_block.row_block_info().row_num) {
         OLAP_LOG_WARNING("mutable block is not large enough for storing the changed block. "
                          "[mutable_block_size=%ld, ref_block_size=%u]",
-                         mutable_block->allocated_row_num(),
+                         mutable_block->capacity(),
                          ref_block.row_block_info().row_num);
         return false;
     }
 
-    if (!ref_block._is_inited) {
-        OLAP_LOG_WARNING("the row block referenced is uninited.");
-        return false;
-    }
-
     mutable_block->clear();
 
     RowCursor write_helper;
@@ -240,17 +200,12 @@ bool RowBlockChanger::change_row_block(
 
     // 一行一行地进行比较
     for (size_t row_index = 0; row_index < row_num; ++row_index) {
-        if (ref_block.get_row_to_read(row_index, &read_helper) != OLAP_SUCCESS) {
-            OLAP_LOG_WARNING("fail to get row to read");
-            return false;
-        }
+        ref_block.get_row(row_index, &read_helper);
 
         // filter data according to delete conditions specified in DeleteData command
-        if (df_type == OLAP_DATA_FILE) {
-            if (is_data_left_vec[row_index] == 1) {
-                if (_delete_handler.is_filter_data(data_version, read_helper)) {
-                    is_data_left_vec[row_index] = 0;
-                }
+        if (is_data_left_vec[row_index] == 1) {
+            if (_delete_handler.is_filter_data(data_version, read_helper)) {
+                is_data_left_vec[row_index] = 0;
             }
         }
     }
@@ -267,6 +222,7 @@ bool RowBlockChanger::change_row_block(
     const bool need_filter_data = (new_row_num != row_num);
     const bool filter_all = (new_row_num == 0);
 
+    MemPool* mem_pool = mutable_block->mem_pool();
     // b. æ ¹æ®å‰é¢çš„过滤信æ¯ï¼Œåªå¯¹è¿˜æ ‡è®°ä¸º1的处ç†
     for (size_t i = 0, len = mutable_block->tablet_schema().size(); !filter_all && i < len; ++i) {
         int32_t ref_column = _schema_mapping[i].ref_column;
@@ -285,32 +241,21 @@ bool RowBlockChanger::change_row_block(
                     }
 
                     // 指定新的è¦å†™å…¥çš„row index(ä¸åŒäºŽè¯»çš„row_index)
-                    if (mutable_block->get_row_to_write(new_row_index++, &write_helper)
-                            != OLAP_SUCCESS) {
-                        OLAP_LOG_WARNING("fail to get row to write");
-                        return false;
-                    }
-
-                    if (ref_block.get_row_to_read(row_index, &read_helper) != OLAP_SUCCESS) {
-                        OLAP_LOG_WARNING("fail to get row to read");
-                        return false;
-                    }
+                    mutable_block->get_row(new_row_index++, &write_helper);
+                    ref_block.get_row(row_index, &read_helper);
 
                     if (true == read_helper.is_null(ref_column)) {
                         write_helper.set_null(i);
                     } else {
-                        // è¦å†™å…¥çš„
                         const Field* field_to_read = read_helper.get_field_by_index(ref_column);
                         if (NULL == field_to_read) {
                             OLAP_LOG_WARNING("faile to get ref field.[index=%d]", ref_column);
                             return false;
                         }
-
+                        
                         write_helper.set_not_null(i);
-                        if (write_helper.read_by_index(i, field_to_read->buf()) != OLAP_SUCCESS) {
-                            OLAP_LOG_WARNING("faile to read field");
-                            return false;
-                        }
+                        char* buf = field_to_read->get_ptr(read_helper.get_buf());
+                        write_helper.set_field_content(i, buf, mem_pool);
                     }
                 }
 
@@ -326,16 +271,9 @@ bool RowBlockChanger::change_row_block(
                     }
 
                     // 指定新的è¦å†™å…¥çš„row index(ä¸åŒäºŽè¯»çš„row_index)
-                    if (mutable_block->get_row_to_write(new_row_index++, &write_helper)
-                            != OLAP_SUCCESS) {
-                        OLAP_LOG_WARNING("fail to get row to write");
-                        return false;
-                    }
+                    mutable_block->get_row(new_row_index++, &write_helper);
 
-                    if (ref_block.get_row_to_read(row_index, &read_helper) != OLAP_SUCCESS) {
-                        OLAP_LOG_WARNING("fail to get row to read");
-                        return false;
-                    }
+                    ref_block.get_row(row_index, &read_helper);
 
                     if (true == read_helper.is_null(ref_column)) {
                         write_helper.set_null(i);
@@ -349,15 +287,13 @@ bool RowBlockChanger::change_row_block(
 
                         write_helper.set_not_null(i);
                         int p = ref_block.tablet_schema()[ref_column].length - 1;
-                        char* buf = field_to_read->buf();
+                        StringSlice* slice = reinterpret_cast(field_to_read->get_ptr(read_helper.get_buf()));
+                        char* buf = slice->data;
                         while (p >= 0 && buf[p] == '\0') {
                             p--;
                         }
-                        if (write_helper.read_by_index(
-                                i, field_to_read->buf(), p + 1) != OLAP_SUCCESS) {
-                            OLAP_LOG_WARNING("faile to read field");
-                            return false;
-                        }
+                        slice->size = p + 1;
+                        write_helper.set_field_content(i, buf, mem_pool);
                     }
                 }
 
@@ -407,17 +343,14 @@ bool RowBlockChanger::change_row_block(
                     continue;
                 }
 
-                if (mutable_block->get_row_to_write(new_row_index++, &write_helper)
-                        != OLAP_SUCCESS) {
-                    OLAP_LOG_WARNING("fail to get row to write");
-                    return false;
-                }
+                mutable_block->get_row(new_row_index++, &write_helper);
 
                 if (_schema_mapping[i].default_value->is_null()) {
                     write_helper.set_null(i);
                 } else {
                     write_helper.set_not_null(i);
-                    write_helper.read_by_index(i, _schema_mapping[i].default_value->buf());
+                    write_helper.set_field_content(
+                        i, _schema_mapping[i].default_value->ptr(), mem_pool);
                 }
             }
         }
@@ -451,7 +384,7 @@ bool RowBlockSorter::sort(RowBlock** row_block) {
     DataFileType data_file_type = (*row_block)->row_block_info().data_file_type;
     bool null_supported = (*row_block)->row_block_info().null_supported;
 
-    if (_swap_row_block == NULL || _swap_row_block->allocated_row_num() < row_num) {
+    if (_swap_row_block == NULL || _swap_row_block->capacity() < row_num) {
         if (_swap_row_block != NULL) {
             _row_block_allocator->release(_swap_row_block);
             _swap_row_block = NULL;
@@ -465,6 +398,13 @@ bool RowBlockSorter::sort(RowBlock** row_block) {
         }
     }
 
+    RowCursor helper_row;
+    auto res = helper_row.init(_swap_row_block->tablet_schema());
+    if (res != OLAP_SUCCESS) {
+        LOG(WARNING) << "row cursor init failed.res:" << res;
+        return false;
+    }
+
     RowBlock* temp = NULL;
     vector row_cursor_list((*row_block)->row_block_info().row_num, NULL);
 
@@ -479,9 +419,7 @@ bool RowBlockSorter::sort(RowBlock** row_block) {
             goto SORT_ERR_EXIT;
         }
 
-        if ((*row_block)->get_row_to_read(i, row_cursor_list[i]) != OLAP_SUCCESS) {
-            goto SORT_ERR_EXIT;
-        }
+        (*row_block)->get_row(i, row_cursor_list[i]);
     }
 
     // Must use 'std::' because this class has a function whose name is sort too
@@ -489,9 +427,9 @@ bool RowBlockSorter::sort(RowBlock** row_block) {
 
     // copy the results sorted to temp row block.
     _swap_row_block->clear();
-
     for (size_t i = 0; i < row_cursor_list.size(); ++i) {
-        if (_swap_row_block->set_row(i, (*row_cursor_list[i])) != OLAP_SUCCESS) {
+        _swap_row_block->get_row(i, &helper_row);
+        if (helper_row.copy(*row_cursor_list[i], _swap_row_block->mem_pool()) != OLAP_SUCCESS) {
             OLAP_LOG_WARNING("failed to set row for row block. [row=%ld]", i);
             goto SORT_ERR_EXIT;
         }
@@ -588,12 +526,12 @@ void RowBlockAllocator::release(RowBlock* row_block) {
         return;
     }
 
-    _memory_allocated -= row_block->allocated_row_num() * _row_len;
+    _memory_allocated -= row_block->capacity() * _row_len;
 
     OLAP_LOG_DEBUG("RowBlockAllocator::release() "
                    "[this=%p num_rows=%ld m_memory_allocated=%ld p=%p]",
                    this,
-                   row_block->allocated_row_num(),
+                   row_block->capacity(),
                    _memory_allocated,
                    row_block);
     delete row_block;
@@ -609,6 +547,8 @@ bool RowBlockMerger::merge(
         uint64_t* merged_rows) {
     uint64_t tmp_merged_rows = 0;
     RowCursor row_cursor;
+    MemPool* mem_pool = writer->mem_pool();
+
     if (row_cursor.init(_olap_table->tablet_schema()) != OLAP_SUCCESS) {
         OLAP_LOG_WARNING("fail to init row cursor.");
         goto MERGE_ERR;
@@ -616,13 +556,19 @@ bool RowBlockMerger::merge(
 
     _make_heap(row_block_arr);
 
+    // TODO: for now, string type in rowblock is not allocated
+    // memory during init procedure. So, copying content
+    // in row_cursor to rowblock is necessary
+    // That's not very memory-efficient!
+
     while (_heap.size() > 0) {
         if (writer->attached_by(&row_cursor) != OLAP_SUCCESS) {
             OLAP_LOG_WARNING("writer error.");
             goto MERGE_ERR;
         }
+        row_cursor.allocate_memory_for_string_type(_olap_table->tablet_schema(), mem_pool);
 
-        row_cursor.copy(*(_heap.top().row_cursor));
+        row_cursor.agg_init(*(_heap.top().row_cursor));
 
         if (!_pop_heap()) {
             goto MERGE_ERR;
@@ -680,13 +626,7 @@ bool RowBlockMerger::_make_heap(const vector& row_block_arr) {
             return false;
         }
 
-        if (OLAP_SUCCESS != element.row_block->get_row_to_read(element.row_block_index,
-                                                               element.row_cursor)) {
-            OLAP_LOG_WARNING("failed to get row from row block. [row_block_index=%d]",
-                             element.row_block_index);
-            SAFE_DELETE(element.row_cursor);
-            return false;
-        }
+        element.row_block->get_row(element.row_block_index, element.row_cursor);
 
         _heap.push(element);
     }
@@ -703,12 +643,7 @@ bool RowBlockMerger::_pop_heap() {
         return true;
     }
 
-    if (OLAP_SUCCESS != element.row_block->get_row_to_read(element.row_block_index,
-                                                           element.row_cursor)) {
-        OLAP_LOG_WARNING("failed to get row from row block. [row_block_index=%d]",
-                         element.row_block_index);
-        return false;
-    }
+    element.row_block->get_row(element.row_block_index, element.row_cursor);
 
     _heap.push(element);
     return true;
@@ -736,18 +671,16 @@ SchemaChangeDirectly::~SchemaChangeDirectly() {
 }
 
 bool SchemaChangeDirectly::_write_row_block(IWriter* writer, RowBlock* row_block) {
+    MemPool* mem_pool = writer->mem_pool();
     for (uint32_t i = 0; i < row_block->row_block_info().row_num; i++) {
         if (OLAP_SUCCESS != writer->attached_by(_dst_cursor)) {
             OLAP_LOG_WARNING("fail to attach writer");
             return false;
         }
 
-        if (OLAP_SUCCESS != row_block->get_row_to_read(i, _src_cursor)) {
-            OLAP_LOG_WARNING("fail to get row from row block.");
-            return false;
-        }
+        row_block->get_row(i, _src_cursor);
 
-        _dst_cursor->copy(*_src_cursor);
+        _dst_cursor->copy(*_src_cursor, mem_pool);
         writer->next(*_dst_cursor);
     }
 
@@ -885,7 +818,7 @@ bool SchemaChangeDirectly::process(IData* olap_data, OLAPIndex* new_olap_index)
     while (NULL != ref_row_block) {
         // 注æ„这里强制分é…和旧å—等大的å—(å°äº†å¯èƒ½ä¼šå­˜ä¸ä¸‹)
         if (NULL == new_row_block
-                || new_row_block->allocated_row_num() < ref_row_block->row_block_info().row_num) {
+                || new_row_block->capacity() < ref_row_block->row_block_info().row_num) {
             if (NULL != new_row_block) {
                 _row_block_allocator->release(new_row_block);
                 new_row_block = NULL;
@@ -940,10 +873,7 @@ bool SchemaChangeDirectly::process(IData* olap_data, OLAPIndex* new_olap_index)
         goto DIRECTLY_PROCESS_ERR;
     }
 
-    if (olap_data->data_file_type() == COLUMN_ORIENTED_FILE) {
-        reset_filted_rows();
-        add_filted_rows(olap_data->get_filted_rows());
-    }
+    add_filted_rows(olap_data->get_filted_rows());
 
     // Check row num changes
     if (config::row_nums_check) {
@@ -1146,10 +1076,7 @@ bool SchemaChangeWithSorting::process(IData* olap_data, OLAPIndex* new_olap_inde
         goto SORTING_PROCESS_ERR;
     }
 
-    if (olap_data->data_file_type() == COLUMN_ORIENTED_FILE) {
-        reset_filted_rows();
-        add_filted_rows(olap_data->get_filted_rows());
-    }
+    add_filted_rows(olap_data->get_filted_rows());
 
     // Check row num changes
     if (config::row_nums_check) {
@@ -2342,17 +2269,12 @@ OLAPStatus SchemaChangeHandler::_parse_request(SmartOLAPTable ref_olap_table,
 OLAPStatus SchemaChangeHandler::_init_column_mapping(ColumnMapping* column_mapping,
                                                      const FieldInfo& column_schema,
                                                      const std::string& value) {
-    column_mapping->default_value = Field::create(column_schema);
+    column_mapping->default_value = WrapperField::create(column_schema);
 
     if (column_mapping->default_value == NULL) {
         return OLAP_ERR_MALLOC_ERROR;
     }
 
-    if (!column_mapping->default_value->allocate()) {
-        OLAP_LOG_WARNING("failed to init Field. [column='%s']", column_schema.name.c_str());
-        return OLAP_ERR_CE_CMD_PARAMS_ERROR;
-    }
-
     if (true == column_schema.is_allow_null && value.length() == 0) {
         column_mapping->default_value->set_null();
     } else {
diff --git a/be/src/olap/schema_change.h b/be/src/olap/schema_change.h
index 7d123820fe..a9dd807c7a 100644
--- a/be/src/olap/schema_change.h
+++ b/be/src/olap/schema_change.h
@@ -47,7 +47,7 @@ struct ColumnMapping {
     // >=0: use origin column
     int32_t ref_column;
     // normally for default value. stores values for filters
-    Field* default_value;
+    WrapperField* default_value;
 };
 
 class RowBlockChanger {
diff --git a/be/src/olap/string_slice.h b/be/src/olap/string_slice.h
new file mode 100644
index 0000000000..a602a3fc79
--- /dev/null
+++ b/be/src/olap/string_slice.h
@@ -0,0 +1,213 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_SRC_OLAP_STRING_SLICE_H
+#define BDG_PALO_BE_SRC_OLAP_STRING_SLICE_H
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "olap/olap_define.h"
+
+namespace palo {
+
+/// @brief A wrapper around externally allocated data.
+///
+/// StringSlice is a simple structure containing a pointer into some external
+/// storage and a size. The user of a StringSlice must ensure that the slice
+/// is not used after the corresponding external storage has been
+/// deallocated.
+///
+/// Multiple threads can invoke const methods on a StringSlice without
+/// external synchronization, but if any of the threads may call a
+/// non-const method, all threads accessing the same StringSlice must use
+/// external synchronization.
+struct StringSlice {
+public:
+    char* data;
+    size_t size;
+    // Intentionally copyable
+
+    /// Create an empty slice.
+    StringSlice() : data(const_cast("")), size(0) { }
+
+
+    /// Create a slice that refers to a @c char byte array.
+    StringSlice(const char* d, size_t n) :
+        data(const_cast(d)), size(n) { }
+
+    /// Create a slice that refers to the contents of the given string.
+    StringSlice(const std::string& s) : // NOLINT(runtime/explicit)
+        data(const_cast(s.data())), size(s.size()) { }
+
+    /// Create a slice that refers to a C-string s[0,strlen(s)-1].
+    StringSlice(const char* s) : // NOLINT(runtime/explicit)
+        data(const_cast(s)), size(strlen(s)) { }
+
+    /*
+    /// @return A pointer to the beginning of the referenced data.
+    const char* data() const { return data; }
+
+    /// @return A mutable pointer to the beginning of the referenced data.
+    char* mutable_data() { return const_cast(data); }
+
+    /// @return The length (in bytes) of the referenced data.
+    size_t size() const { return size; }
+    */
+
+    /// @return @c true iff the length of the referenced data is zero.
+    bool empty() const { return size == 0; }
+
+    /// @return the n-th byte in the referenced data.
+    const char operator[](size_t n) const {
+        assert(n < size);
+        return data[n];
+    }
+
+    /// Change this slice to refer to an empty array.
+    void clear() {
+        data = const_cast("");
+        size = 0;
+    }
+
+    /// Drop the first "n" bytes from this slice.
+    ///
+    /// @pre n <= size
+    ///
+    /// @note Only the base and bounds of the slice are changed;
+    ///   the data is not modified.
+    ///
+    /// @param [in] n
+    ///   Number of bytes that should be dropped from the beginning.
+    void remove_prefix(size_t n) {
+        assert(n <= size);
+        data += n;
+        size -= n;
+    }
+
+    /// Truncate the slice to the given number of bytes.
+    ///
+    /// @pre n <= size
+    ///
+    /// @note Only the base and bounds of the slice are changed;
+    ///   the data is not modified.
+    ///
+    /// @param [in] n
+    ///   The new size of the slice.
+    void truncate(size_t n) {
+        assert(n <= size);
+        size = n;
+    }
+
+    /// @return A string that contains a copy of the referenced data.
+    std::string to_string() const { return std::string(data, size); }
+
+    /// Do a three-way comparison of the slice's data.
+    int compare(const StringSlice& b) const;
+
+    /// Check whether the slice starts with the given prefix.
+    bool starts_with(const StringSlice& x) const {
+        return ((size >= x.size) &&
+                (mem_equal(data, x.data, x.size)));
+    }
+
+    /// @brief Comparator struct, useful for ordered collections (like STL maps).
+    struct Comparator {
+        /// Compare two slices using StringSlice::compare()
+        ///
+        /// @param [in] a
+        ///   The slice to call StringSlice::compare() at.
+        /// @param [in] b
+        ///   The slice to use as a parameter for StringSlice::compare().
+        /// @return @c true iff @c a is less than @c b by StringSlice::compare().
+        bool operator()(const StringSlice& a, const StringSlice& b) const {
+            return a.compare(b) < 0;
+        }
+    };
+
+    /// Relocate/copy the slice's data into a new location.
+    ///
+    /// @param [in] d
+    ///   The new location for the data. If it's the same location, then no
+    ///   relocation is done. It is assumed that the new location is
+    ///   large enough to fit the data.
+    void relocate(char* d) {
+        if (data != d) {
+            memcpy(d, data, size);
+            data = d;
+        }
+    }
+
+    friend bool operator==(const StringSlice& x, const StringSlice& y);
+
+    static bool mem_equal(const void* a, const void* b, size_t n) {
+        return memcmp(a, b, n) == 0;
+    }
+
+    static int mem_compare(const void* a, const void* b, size_t n) {
+        return memcmp(a, b, n);
+    }
+
+};
+
+/// Check whether two slices are identical.
+inline bool operator==(const StringSlice& x, const StringSlice& y) {
+    return ((x.size == y.size) &&
+            (StringSlice::mem_equal(x.data, y.data, x.size)));
+}
+
+/// Check whether two slices are not identical.
+inline bool operator!=(const StringSlice& x, const StringSlice& y) {
+    return !(x == y);
+}
+
+inline int StringSlice::compare(const StringSlice& b) const {
+    const int min_len = (size < b.size) ? size : b.size;
+    int r = mem_compare(data, b.data, min_len);
+    if (r == 0) {
+        if (size < b.size) r = -1;
+        else if (size > b.size) r = +1;
+    }
+    return r;
+}
+
+/// @brief STL map whose keys are StringSlices.
+///
+/// An example of usage:
+/// @code
+///   typedef StringSliceMap::type MySliceMap;
+///
+///   MySliceMap my_map;
+///   my_map.insert(MySliceMap::value_type(a, 1));
+///   my_map.insert(MySliceMap::value_type(b, 2));
+///   my_map.insert(MySliceMap::value_type(c, 3));
+///
+///   for (const MySliceMap::value_type& pair : my_map) {
+///     ...
+///   }
+/// @endcode
+template 
+struct StringSliceMap {
+    /// A handy typedef for the slice map with appropriate comparison operator.
+    typedef std::map type;
+};
+
+}  // namespace palo
+
+#endif  // BDG_PALO_BE_SRC_OLAP_STRING_SLICE_H
diff --git a/be/src/olap/types.cpp b/be/src/olap/types.cpp
new file mode 100644
index 0000000000..de81c0ae3a
--- /dev/null
+++ b/be/src/olap/types.cpp
@@ -0,0 +1,79 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/types.h"
+
+namespace palo {
+
+template
+TypeInfo::TypeInfo(TypeTraitsClass t)
+      : _equal(TypeTraitsClass::equal),
+        _cmp(TypeTraitsClass::cmp),
+        _copy_with_pool(TypeTraitsClass::copy_with_pool),
+        _copy_without_pool(TypeTraitsClass::copy_without_pool),
+        _from_string(TypeTraitsClass::from_string),
+        _to_string(TypeTraitsClass::to_string),
+        _set_to_max(TypeTraitsClass::set_to_max),
+        _set_to_min(TypeTraitsClass::set_to_min),
+        _is_min(TypeTraitsClass::is_min),
+        _hash_code(TypeTraitsClass::hash_code),
+        _size(TypeTraitsClass::size) {}
+
+class TypeInfoResolver {
+    DECLARE_SINGLETON(TypeInfoResolver);
+public:
+    TypeInfo* get_type_info(const FieldType t) {
+        auto pair = _mapping.find(t);
+        DCHECK(pair != _mapping.end()) << "Bad field type: " << t;
+        return pair->second.get();
+    }
+
+private:
+    template void add_mapping() {
+        TypeTraits traits;
+        _mapping.emplace(field_type,
+                 std::shared_ptr(new TypeInfo(traits)));
+    }
+
+    std::unordered_map,
+        std::hash> _mapping;
+
+    DISALLOW_COPY_AND_ASSIGN(TypeInfoResolver);
+};
+
+TypeInfoResolver::TypeInfoResolver() {
+    add_mapping();
+    add_mapping();
+    add_mapping();
+    add_mapping();
+    add_mapping();
+    add_mapping();
+    add_mapping();
+    add_mapping();
+    add_mapping();
+    add_mapping();
+    add_mapping();
+    add_mapping();
+    add_mapping();
+}
+
+TypeInfoResolver::~TypeInfoResolver() {}
+
+TypeInfo* get_type_info(FieldType field_type) {
+    return TypeInfoResolver::get_instance()->get_type_info(field_type);
+}
+
+} // namespace palo
diff --git a/be/src/olap/types.h b/be/src/olap/types.h
new file mode 100644
index 0000000000..706512c18f
--- /dev/null
+++ b/be/src/olap/types.h
@@ -0,0 +1,886 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_SRC_OLAP_TYPES_H
+#define BDG_PALO_BE_SRC_OLAP_TYPES_H
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+#include "olap/olap_common.h"
+#include "olap/olap_define.h"
+#include "olap/field_info.h"
+#include "olap/string_slice.h"
+#include "runtime/mem_pool.h"
+#include "util/hash_util.hpp"
+#include "util/mem_util.hpp"
+#include "util/types.h"
+
+namespace palo {
+
+class TypeInfo {
+public:
+    inline int equal(char* left, char* right) const {
+        return _equal(left, right);
+    }
+
+    inline int cmp(char* left, char* right) const {
+        return _cmp(left, right);
+    }
+
+    inline void copy_with_pool(char* dest, const char* src, MemPool* mem_pool) {
+        _copy_with_pool(dest, src, mem_pool);
+    }
+
+    inline void copy_without_pool(char* dest, const char* src) {
+        _copy_without_pool(dest, src);
+    }
+
+    OLAPStatus from_string(char* buf, const std::string& scan_key) {
+        return _from_string(buf, scan_key);
+    }
+
+    std::string to_string(char* src) { return _to_string(src); }
+
+    inline void set_to_max(char* buf) { _set_to_max(buf); }
+    inline void set_to_min(char* buf) { _set_to_min(buf); }
+    inline bool is_min(char* buf) { return _is_min(buf); }
+
+    inline uint32_t hash_code(char* data, uint32_t seed) { return _hash_code(data, seed); }
+    inline const size_t size() const { return _size; }
+
+private:
+    int (*_equal)(const void* left, const void* right);
+    int (*_cmp)(const void* left, const void* right);
+
+    void (*_copy_with_pool)(char* dest, const char* src, MemPool* mem_pool);
+    void (*_copy_without_pool)(char* dest, const char* src);
+
+    OLAPStatus (*_from_string)(char* buf, const std::string& scan_key);
+    std::string (*_to_string)(char* src);
+
+    void (*_set_to_max)(char* buf);
+    void (*_set_to_min)(char* buf);
+    bool (*_is_min)(char* buf);
+
+    uint32_t (*_hash_code)(char* data, uint32_t seed);
+
+    const size_t _size;
+
+    friend class TypeInfoResolver;
+    template TypeInfo(TypeTraitsClass t);
+};
+
+extern TypeInfo* get_type_info(FieldType field_type);
+
+// TODO: NullOffset
+struct NullOffset {
+    int byte_offset;
+    uint8_t bit_mask;  // to extract null
+
+    NullOffset(int byte_offset, int bit_offset)
+        : byte_offset(byte_offset),
+          bit_mask(bit_offset == -1 ? 0 : 1 << (7 - bit_offset)) {}
+};
+
+template struct FieldTypeTraits {};
+
+template
+static int generic_equal(const void* left, const void* right) {
+    typedef typename FieldTypeTraits::CppType CppType;
+    CppType l_value = *reinterpret_cast(left);
+    CppType r_value = *reinterpret_cast(right);
+    return l_value == r_value;
+};
+
+template
+static int generic_compare(const void* left, const void* right) {
+    typedef typename FieldTypeTraits::CppType CppType;
+    CppType left_int = *reinterpret_cast(left);
+    CppType right_int = *reinterpret_cast(right);
+    if (left_int < right_int) {
+        return -1;
+    } else if (left_int > right_int) {
+        return 1;
+    } else {
+        return 0;
+    }
+};
+
+template
+static void generic_copy(char* dest, const char* src) {
+    typedef typename FieldTypeTraits::CppType CppType;
+    *reinterpret_cast(dest) = *reinterpret_cast(src);
+}
+
+template<>
+void generic_copy(char* dest, const char* src) {
+    *reinterpret_cast(dest) = *reinterpret_cast(src);
+}
+
+template
+static std::string generic_to_string(char* src) {
+    typedef typename FieldTypeTraits::CppType CppType;
+    std::stringstream stream;
+    stream << *reinterpret_cast(src);
+    return stream.str();
+}
+
+template
+static OLAPStatus generic_from_string(char* buf, const std::string& scan_key) {
+    typedef typename FieldTypeTraits::CppType CppType;
+    CppType value = 0;
+    if (scan_key.length() > 0) {
+        value = static_cast(strtol(scan_key.c_str(), NULL, 10));
+    }
+    *reinterpret_cast(buf) = value;
+    return OLAP_SUCCESS;
+}
+
+template
+static void generic_set_to_max(char* buf) {
+    typedef typename FieldTypeTraits::CppType CppType;
+    *reinterpret_cast(buf) = std::numeric_limits::max();
+}
+
+template
+static void generic_set_to_min(char* buf) {
+    typedef typename FieldTypeTraits::CppType CppType;
+    *reinterpret_cast(buf) = std::numeric_limits::min();
+}
+
+template
+static bool generic_is_min(char* buf) {
+    typedef typename FieldTypeTraits::CppType CppType;
+    CppType min_value = std::numeric_limits::min();
+    return (*reinterpret_cast(buf) == min_value);
+}
+
+template
+static uint32_t generic_hash_code(char* data, uint32_t seed) {
+    typedef typename FieldTypeTraits::CppType CppType;
+    return HashUtil::hash(data, sizeof(CppType), seed);
+}
+
+template<>
+struct FieldTypeTraits {
+    typedef int8_t CppType;
+    static const char* name() {
+        return "int8_t";
+    }
+    static int equal(const void* left, const void* right) {
+        return generic_equal(left, right);
+    }
+    static int cmp(const void* left, const void* right) {
+        return generic_compare(left, right);
+    }
+    static std::string to_string(char* src) {
+        char buf[1024] = {'\0'};
+        snprintf(buf, sizeof(buf), "%d", *reinterpret_cast(src));
+        return std::string(buf);
+    }
+    static OLAPStatus from_string(char* buf, const std::string& scan_key) {
+        return generic_from_string(buf, scan_key);
+    }
+    static void copy_with_pool(char* dest, const char* src, MemPool* mem_pool) {
+        generic_copy(dest, src);
+    }
+    static void copy_without_pool(char* dest, const char* src) {
+        generic_copy(dest, src);
+    }
+    static void set_to_max(char* buf) {
+        generic_set_to_max(buf);
+    }
+    static void set_to_min(char* buf) {
+        generic_set_to_min(buf);
+    }
+    static bool is_min(char* buf) {
+        return generic_is_min(buf);
+    }
+    static uint32_t hash_code(char* data, uint32_t seed) {
+        return generic_hash_code(data, seed);
+    }
+};
+
+template<>
+struct FieldTypeTraits {
+    typedef int16_t CppType;
+    static const char* name() {
+        return "int16_t";
+    }
+    static int equal(const void* left, const void* right) {
+        return generic_equal(left, right);
+    }
+    static int cmp(const void* left, const void* right) {
+        return generic_compare(left, right);
+    }
+    static std::string to_string(char* src) {
+        return generic_to_string(src);
+    }
+    static OLAPStatus from_string(char* buf, const std::string& scan_key) {
+        return generic_from_string(buf, scan_key);
+    }
+    static void copy_with_pool(char* dest, const char* src, MemPool* mem_pool) {
+        generic_copy(dest, src);
+    }
+    static void copy_without_pool(char* dest, const char* src) {
+        generic_copy(dest, src);
+    }
+    static void set_to_max(char* buf) {
+        generic_set_to_max(buf);
+    }
+    static void set_to_min(char* buf) {
+        generic_set_to_min(buf);
+    }
+    static bool is_min(char* buf) {
+        return generic_is_min(buf);
+    }
+    static uint32_t hash_code(char* data, uint32_t seed) {
+        return generic_hash_code(data, seed);
+    }
+};
+
+template<>
+struct FieldTypeTraits {
+    typedef int32_t CppType;
+    static const char* name() {
+        return "int32_t";
+    }
+    static int equal(const void* left, const void* right) {
+        return generic_equal(left, right);
+    }
+    static int cmp(const void* left, const void* right) {
+        return generic_compare(left, right);
+    }
+    static std::string to_string(char* src) {
+        return generic_to_string(src);
+    }
+    static OLAPStatus from_string(char* buf, const std::string& scan_key) {
+        return generic_from_string(buf, scan_key);
+    }
+    static void copy_with_pool(char* dest, const char* src, MemPool* mem_pool) {
+        generic_copy(dest, src);
+    }
+    static void copy_without_pool(char* dest, const char* src) {
+        generic_copy(dest, src);
+    }
+    static void set_to_max(char* buf) {
+        generic_set_to_max(buf);
+    }
+    static void set_to_min(char* buf) {
+        generic_set_to_min(buf);
+    }
+    static bool is_min(char* buf) {
+        return generic_is_min(buf);
+    }
+    static uint32_t hash_code(char* data, uint32_t seed) {
+        return generic_hash_code(data, seed);
+    }
+};
+
+template<>
+struct FieldTypeTraits {
+    typedef int64_t CppType;
+    static const char* name() {
+        return "int64_t";
+    }
+    static int equal(const void* left, const void* right) {
+        return generic_equal(left, right);
+    }
+    static int cmp(const void* left, const void* right) {
+        return generic_compare(left, right);
+    }
+    static OLAPStatus from_string(char* buf, const std::string& scan_key) {
+        return generic_from_string(buf, scan_key);
+    }
+    static std::string to_string(char* src) {
+        return generic_to_string(src);
+    }
+    static void copy_with_pool(char* dest, const char* src, MemPool* mem_pool) {
+        generic_copy(dest, src);
+    }
+    static void copy_without_pool(char* dest, const char* src) {
+        generic_copy(dest, src);
+    }
+    static void set_to_max(char* buf) {
+        generic_set_to_max(buf);
+    }
+    static void set_to_min(char* buf) {
+        generic_set_to_min(buf);
+    }
+    static bool is_min(char* buf) {
+        return generic_is_min(buf);
+    }
+    static uint32_t hash_code(char* data, uint32_t seed) {
+        return generic_hash_code(data, seed);
+    }
+};
+
+template<>
+struct FieldTypeTraits<OLAP_FIELD_TYPE_LARGEINT> {
+    typedef int128_t CppType;
+    static const char* name() {
+        return "int128_t";
+    }
+    static int equal(const void* left, const void* right) {
+        return generic_equal<CppType>(left, right);
+    }
+    static int cmp(const void* left, const void* right) {
+        return generic_compare<CppType>(left, right);
+    }
+    static OLAPStatus from_string(char* buf, const std::string& scan_key) {
+        int128_t value = 0;
+
+        const char* value_string = scan_key.c_str();
+        char* end = NULL;
+        value = strtol(value_string, &end, 10);
+        if (*end != 0) {
+            value = 0;
+        } else if (value > LONG_MIN && value < LONG_MAX) {
+            // use strtol result directly
+        } else {
+            bool is_negative = false;
+            if (*value_string == '-' || *value_string == '+') {
+                if (*(value_string++) == '-') {
+                    is_negative = true;
+                }
+            }
+
+            uint128_t current = 0;
+            uint128_t max_int128 = ~((int128_t)(1) << 127);
+            while (*value_string != 0) {
+                if (current > max_int128 / 10) {
+                    break;
+                }
+
+                current = current * 10 + (*(value_string++) - '0');
+            }
+            if (*value_string != 0
+                || (!is_negative && current > max_int128)
+                || (is_negative && current > max_int128 + 1)) {
+                current = 0;
+            }
+
+            value = is_negative ? -current : current;
+        }
+
+        *reinterpret_cast<PackedInt128*>(buf) = value;
+
+        return OLAP_SUCCESS;
+    }
+    static std::string to_string(char* src) {
+        char buf[1024];
+        int128_t value = reinterpret_cast<PackedInt128*>(src)->value;
+        if (value >= std::numeric_limits<int64_t>::min()
+            && value <= std::numeric_limits<int64_t>::max()) {
+            snprintf(buf, sizeof(buf), "%ld", (int64_t)value);
+        } else {
+            char* current = buf;
+            uint128_t abs_value = value;
+            if (value < 0) {
+                *(current++) = '-';
+                abs_value = -value;
+            }
+
+            // the max value of uint64_t is 18446744073709551615UL,
+            // so use Z19_UINT64 to divide uint128_t
+            const static uint64_t Z19_UINT64 = 10000000000000000000ULL;
+            uint64_t suffix = abs_value % Z19_UINT64;
+            uint64_t middle = abs_value / Z19_UINT64 % Z19_UINT64;
+            uint64_t prefix = abs_value / Z19_UINT64 / Z19_UINT64;
+
+            char* end = buf + sizeof(buf);
+            if (prefix > 0) {
+                current += snprintf(current, end - current, "%" PRIu64, prefix);
+                current += snprintf(current, end - current, "%.19" PRIu64, middle);
+                current += snprintf(current, end - current, "%.19" PRIu64, suffix);
+            } else if (OLAP_LIKELY(middle > 0)) {
+                current += snprintf(current, end - current, "%" PRIu64, middle);
+                current += snprintf(current, end - current, "%.19" PRIu64, suffix);
+            } else {
+                current += snprintf(current, end - current, "%" PRIu64, suffix);
+            }
+        }
+
+        return std::string(buf);
+    }
+    // GCC7.3 will generate movaps instruction, which will lead to SEGV when buf is
+    // not aligned to 16 byte
+    static void copy_with_pool(char* dest, const char* src, MemPool* mem_pool) {
+        generic_copy<PackedInt128>(dest, src);
+    }
+    static void copy_without_pool(char* dest, const char* src) {
+        generic_copy<PackedInt128>(dest, src);
+    }
+    static void set_to_max(char* buf) {
+        *reinterpret_cast<PackedInt128*>(buf) = ~((int128_t)(1) << 127);
+    }
+    static void set_to_min(char* buf) {
+        *reinterpret_cast<PackedInt128*>(buf) = (int128_t)(1) << 127;
+    }
+    static bool is_min(char* buf) {
+        int128_t min_value = (CppType)(1) << 127;
+        return reinterpret_cast<PackedInt128*>(buf)->value == min_value;
+    }
+    static uint32_t hash_code(char* data, uint32_t seed) {
+        return generic_hash_code<CppType>(data, seed);
+    }
+};
+
+template<>
+struct FieldTypeTraits<OLAP_FIELD_TYPE_FLOAT> {
+    typedef float CppType;
+    static const char* name() {
+        return "float";
+    }
+    static int equal(const void* left, const void* right) {
+        return generic_equal<CppType>(left, right);
+    }
+    static int cmp(const void* left, const void* right) {
+        return generic_compare<CppType>(left, right);
+    }
+    static OLAPStatus from_string(char* buf, const std::string& scan_key) {
+        CppType value = 0.0f;
+
+        if (scan_key.length() > 0) {
+            value = static_cast<CppType>(atof(scan_key.c_str()));
+        }
+
+        *reinterpret_cast<CppType*>(buf) = value;
+        return OLAP_SUCCESS;
+    }
+    static std::string to_string(char* src) {
+        return generic_to_string<CppType>(src);
+    }
+    static void copy_with_pool(char* dest, const char* src, MemPool* mem_pool) {
+        generic_copy<CppType>(dest, src);
+    }
+    static void copy_without_pool(char* dest, const char* src) {
+        generic_copy<CppType>(dest, src);
+    }
+    static void set_to_max(char* buf) {
+        generic_set_to_max<CppType>(buf);
+    }
+    static void set_to_min(char* buf) {
+        generic_set_to_min<CppType>(buf);
+    }
+    static bool is_min(char* buf) {
+        return generic_is_min<CppType>(buf);
+    }
+    static uint32_t hash_code(char* data, uint32_t seed) {
+        return generic_hash_code<CppType>(data, seed);
+    }
+};
+
+template<>
+struct FieldTypeTraits<OLAP_FIELD_TYPE_DOUBLE> {
+    typedef double CppType;
+    static const char* name() {
+        return "double";
+    }
+    static int equal(const void* left, const void* right) {
+        return generic_equal<CppType>(left, right);
+    }
+    static int cmp(const void* left, const void* right) {
+        return generic_compare<CppType>(left, right);
+    }
+    static OLAPStatus from_string(char* buf, const std::string& scan_key) {
+        double value = 0.0;
+
+        if (scan_key.length() > 0) {
+            value = atof(scan_key.c_str());
+        }
+
+        *reinterpret_cast<CppType*>(buf) = value;
+        return OLAP_SUCCESS;
+    }
+    static std::string to_string(char* src) {
+        char buf[1024] = {'\0'};
+        snprintf(buf, sizeof(buf), "%.10f", *reinterpret_cast<CppType*>(src));
+        return std::string(buf);
+    }
+    static void copy_with_pool(char* dest, const char* src, MemPool* mem_pool) {
+        generic_copy<CppType>(dest, src);
+    }
+    static void copy_without_pool(char* dest, const char* src) {
+        generic_copy<CppType>(dest, src);
+    }
+    static void set_to_max(char* buf) {
+        generic_set_to_max<CppType>(buf);
+    }
+    static void set_to_min(char* buf) {
+        generic_set_to_min<CppType>(buf);
+    }
+    static bool is_min(char* buf) {
+        return generic_is_min<CppType>(buf);
+    }
+    static uint32_t hash_code(char* data, uint32_t seed) {
+        return generic_hash_code<CppType>(data, seed);
+    }
+};
+
+template<>
+struct FieldTypeTraits<OLAP_FIELD_TYPE_DECIMAL> {
+    typedef decimal12_t CppType;
+    static const char* name() {
+        return "decimal";
+    }
+    static int equal(const void* left, const void* right) {
+        return generic_equal<CppType>(left, right);
+    }
+    static int cmp(const void* left, const void* right) {
+        return generic_compare<CppType>(left, right);
+    }
+    static OLAPStatus from_string(char* buf, const std::string& scan_key) {
+        CppType* data_ptr = reinterpret_cast<CppType*>(buf);
+        return data_ptr->from_string(scan_key);
+    }
+    static std::string to_string(char* src) {
+        CppType* data_ptr = reinterpret_cast<CppType*>(src);
+        return data_ptr->to_string();
+    }
+    static void copy_with_pool(char* dest, const char* src, MemPool* mem_pool) {
+        generic_copy<CppType>(dest, src);
+    }
+    static void copy_without_pool(char* dest, const char* src) {
+        generic_copy<CppType>(dest, src);
+    }
+    static void set_to_max(char* buf) {
+        CppType* data = reinterpret_cast<CppType*>(buf);
+        data->integer = 999999999999999999L;
+        data->fraction = 999999999;
+    }
+    static void set_to_min(char* buf) {
+        CppType* data = reinterpret_cast<CppType*>(buf);
+        data->integer = -999999999999999999;
+        data->fraction = -999999999;
+    }
+    static bool is_min(char* buf) {
+        CppType* data = reinterpret_cast<CppType*>(buf);
+        return (data->integer == -999999999999999999L
+                && data->fraction == -999999999);
+    }
+    static uint32_t hash_code(char* data, uint32_t seed) {
+        return generic_hash_code<CppType>(data, seed);
+    }
+};
+
+template<>
+struct FieldTypeTraits<OLAP_FIELD_TYPE_DATE> {
+    typedef uint24_t CppType;
+    static const char* name() {
+        return "date";
+    }
+    static int equal(const void* left, const void* right) {
+        return generic_equal<CppType>(left, right);
+    }
+    static int cmp(const void* left, const void* right) {
+        return generic_compare<CppType>(left, right);
+    }
+    static OLAPStatus from_string(char* buf, const std::string& scan_key) {
+        tm time_tm;
+        char* res = strptime(scan_key.c_str(), "%Y-%m-%d", &time_tm);
+
+        if (NULL != res) {
+            int value = (time_tm.tm_year + 1900) * 16 * 32
+                + (time_tm.tm_mon + 1) * 32
+                + time_tm.tm_mday;
+            *reinterpret_cast<CppType*>(buf) = value;
+        } else {
+            // 1400 - 01 - 01
+            *reinterpret_cast<CppType*>(buf) = 716833;
+        }
+
+        return OLAP_SUCCESS;
+    }
+    static std::string to_string(char* src) {
+        tm time_tm;
+        int value = *reinterpret_cast<CppType*>(src);
+        memset(&time_tm, 0, sizeof(time_tm));
+        time_tm.tm_mday = static_cast<int>(value & 31);
+        time_tm.tm_mon = static_cast<int>(value >> 5 & 15) - 1;
+        time_tm.tm_year = static_cast<int>(value >> 9) - 1900;
+        char buf[20] = {'\0'};
+        strftime(buf, sizeof(buf), "%Y-%m-%d", &time_tm);
+        return std::string(buf);
+    }
+    static void copy_with_pool(char* dest, const char* src, MemPool* mem_pool) {
+        generic_copy<CppType>(dest, src);
+    }
+    static void copy_without_pool(char* dest, const char* src) {
+        generic_copy<CppType>(dest, src);
+    }
+    static void set_to_max(char* buf) {
+        // max is 9999 * 16 * 32 + 12 * 32 + 31;
+        *reinterpret_cast<CppType*>(buf) = 5119903;
+    }
+    static void set_to_min(char* buf) {
+        // min is 0 * 16 * 32 + 1 * 32 + 1;
+        *reinterpret_cast<CppType*>(buf) = 33;
+    }
+    static bool is_min(char* buf) {
+        CppType value = *reinterpret_cast<CppType*>(buf);
+        return (33 == value);
+    }
+    static uint32_t hash_code(char* data, uint32_t seed) {
+        return generic_hash_code<CppType>(data, seed);
+    }
+};
+
+template<>
+struct FieldTypeTraits<OLAP_FIELD_TYPE_DATETIME> {
+    typedef int64_t CppType;
+    static const char* name() {
+        return "datetime";
+    }
+    static int equal(const void* left, const void* right) {
+        return generic_equal<CppType>(left, right);
+    }
+    static int cmp(const void* left, const void* right) {
+        return generic_compare<CppType>(left, right);
+    }
+    static OLAPStatus from_string(char* buf, const std::string& scan_key) {
+        tm time_tm;
+        char* res = strptime(scan_key.c_str(), "%Y-%m-%d %H:%M:%S", &time_tm);
+
+        if (NULL != res) {
+            int64_t value = ((time_tm.tm_year + 1900) * 10000L
+                            + (time_tm.tm_mon + 1) * 100L
+                            + time_tm.tm_mday) * 1000000L
+                            + time_tm.tm_hour * 10000L
+                            + time_tm.tm_min * 100L
+                            + time_tm.tm_sec;
+            *reinterpret_cast<CppType*>(buf) = value;
+        } else {
+            // 1400 - 01 - 01
+            *reinterpret_cast<CppType*>(buf) = 14000101000000L;
+        }
+
+        return OLAP_SUCCESS;
+    }
+    static std::string to_string(char* src) {
+        tm time_tm;
+        int64_t tmp = *reinterpret_cast<CppType*>(src);
+        int64_t part1 = (tmp / 1000000L);
+        int64_t part2 = (tmp - part1 * 1000000L);
+
+        time_tm.tm_year = static_cast<int>((part1 / 10000L) % 10000) - 1900;
+        time_tm.tm_mon = static_cast<int>((part1 / 100) % 100) - 1;
+        time_tm.tm_mday = static_cast<int>(part1 % 100);
+
+        time_tm.tm_hour = static_cast<int>((part2 / 10000L) % 10000);
+        time_tm.tm_min = static_cast<int>((part2 / 100) % 100);
+        time_tm.tm_sec = static_cast<int>(part2 % 100);
+
+        char buf[20] = {'\0'};
+        strftime(buf, 20, "%Y-%m-%d %H:%M:%S", &time_tm);
+        return std::string(buf);
+    }
+    static void copy_with_pool(char* dest, const char* src, MemPool* mem_pool) {
+        generic_copy<CppType>(dest, src);
+    }
+    static void copy_without_pool(char* dest, const char* src) {
+        generic_copy<CppType>(dest, src);
+    }
+    static void set_to_max(char* buf) {
+        // set to the max datetime, i.e. 9999-12-31 23:59:59
+        *reinterpret_cast<CppType*>(buf) = 99991231235959L;
+    }
+    static void set_to_min(char* buf) {
+        *reinterpret_cast<CppType*>(buf) = 101000000;
+    }
+    static bool is_min(char* buf) {
+        CppType value = *reinterpret_cast<CppType*>(buf);
+        return (value == 101000000);
+    }
+    static uint32_t hash_code(char* data, uint32_t seed) {
+        return generic_hash_code<CppType>(data, seed);
+    }
+};
+
+template<>
+struct FieldTypeTraits<OLAP_FIELD_TYPE_CHAR> {
+    typedef StringSlice CppType;
+    static const char* name() {
+        return "char";
+    }
+    static int equal(const void* left, const void* right) {
+        const StringSlice* l_slice = reinterpret_cast<const StringSlice*>(left);
+        const StringSlice* r_slice = reinterpret_cast<const StringSlice*>(right);
+        return *l_slice == *r_slice;
+    }
+    static int cmp(const void* left, const void* right) {
+        const StringSlice* l_slice = reinterpret_cast<const StringSlice*>(left);
+        const StringSlice* r_slice = reinterpret_cast<const StringSlice*>(right);
+        return l_slice->compare(*r_slice);
+    }
+    static OLAPStatus from_string(char* buf, const std::string& scan_key) {
+        size_t value_len = scan_key.length();
+        if (value_len > OLAP_STRING_MAX_LENGTH) {
+            OLAP_LOG_WARNING("the len of value string is too long[len=%lu, max_len=%lu].",
+                             value_len, OLAP_STRING_MAX_LENGTH);
+            return OLAP_ERR_INPUT_PARAMETER_ERROR;
+        }
+
+        StringSlice* slice = reinterpret_cast<StringSlice*>(buf);
+        memory_copy(slice->data, scan_key.c_str(), value_len);
+        if (slice->size < value_len) {
+            /*
+             * CHAR type is of fixed length. Size in slice can be modified
+             * only if value_len is greater than the fixed length. ScanKey
+             * inputed by user may be greater than fixed length.
+             */
+            slice->size = value_len;
+        }
+        return OLAP_SUCCESS;
+    }
+    static std::string to_string(char* src) {
+        StringSlice* slice = reinterpret_cast<StringSlice*>(src);
+        return slice->to_string();
+    }
+    static void copy_with_pool(char* dest, const char* src, MemPool* mem_pool) {
+        StringSlice* l_slice = reinterpret_cast<StringSlice*>(dest);
+        const StringSlice* r_slice = reinterpret_cast<const StringSlice*>(src);
+        l_slice->data = reinterpret_cast<char*>(mem_pool->allocate(r_slice->size));
+        memory_copy(l_slice->data, r_slice->data, r_slice->size);
+        l_slice->size = r_slice->size;
+    }
+    static void copy_without_pool(char* dest, const char* src) {
+        StringSlice* l_slice = reinterpret_cast<StringSlice*>(dest);
+        const StringSlice* r_slice = reinterpret_cast<const StringSlice*>(src);
+        memory_copy(l_slice->data, r_slice->data, r_slice->size);
+        l_slice->size = r_slice->size;
+    }
+    static void set_to_max(char* buf) {
+        // this function is used by scan key,
+        // the size may be greater than length in schema.
+        StringSlice* slice = reinterpret_cast<StringSlice*>(buf);
+        memset(slice->data, 0xff, slice->size);
+    }
+    static void set_to_min(char* buf) {
+        StringSlice* slice = reinterpret_cast<StringSlice*>(buf);
+        memset(slice->data, 0, slice->size);
+    }
+    static bool is_min(char* buf) {
+        StringSlice* slice = reinterpret_cast<StringSlice*>(buf);
+        size_t i = 0;
+        while (i < slice->size) {
+            if (slice->data[i] != '\0') {
+                return false;
+            }
+            i++;
+        }
+        return true;
+    }
+    static uint32_t hash_code(char* data, uint32_t seed) {
+        StringSlice* slice = reinterpret_cast<StringSlice*>(data);
+        return HashUtil::hash(slice->data, slice->size, seed);
+    }
+};
+
+template<>
+struct FieldTypeTraits<OLAP_FIELD_TYPE_VARCHAR> {
+    typedef StringSlice CppType;
+    static const char* name() {
+        return "varchar";
+    }
+    static int equal(const void* left, const void* right) {
+        const StringSlice* l_slice = reinterpret_cast<const StringSlice*>(left);
+        const StringSlice* r_slice = reinterpret_cast<const StringSlice*>(right);
+        return *l_slice == *r_slice;
+    }
+    static int cmp(const void* left, const void* right) {
+        const StringSlice* l_slice = reinterpret_cast<const StringSlice*>(left);
+        const StringSlice* r_slice = reinterpret_cast<const StringSlice*>(right);
+        return l_slice->compare(*r_slice);
+    }
+    static OLAPStatus from_string(char* buf, const std::string& scan_key) {
+        size_t value_len = scan_key.length();
+        if (value_len > OLAP_STRING_MAX_LENGTH) {
+            OLAP_LOG_WARNING("the len of value string is too long[len=%lu, max_len=%lu].",
+                             value_len, OLAP_STRING_MAX_LENGTH);
+            return OLAP_ERR_INPUT_PARAMETER_ERROR;
+        }
+
+        StringSlice* slice = reinterpret_cast<StringSlice*>(buf);
+        memory_copy(slice->data, scan_key.c_str(), value_len);
+        slice->size = value_len;
+        return OLAP_SUCCESS;
+    }
+    static std::string to_string(char* src) {
+        StringSlice* slice = reinterpret_cast<StringSlice*>(src);
+        return slice->to_string();
+    }
+    static void copy_with_pool(char* dest, const char* src, MemPool* mem_pool) {
+        StringSlice* l_slice = reinterpret_cast<StringSlice*>(dest);
+        const StringSlice* r_slice = reinterpret_cast<const StringSlice*>(src);
+
+        l_slice->data = reinterpret_cast<char*>(mem_pool->allocate(r_slice->size));
+        memory_copy(l_slice->data, r_slice->data, r_slice->size);
+        l_slice->size = r_slice->size;
+    }
+    static void copy_without_pool(char* dest, const char* src) {
+        StringSlice* l_slice = reinterpret_cast<StringSlice*>(dest);
+        const StringSlice* r_slice = reinterpret_cast<const StringSlice*>(src);
+        memory_copy(l_slice->data, r_slice->data, r_slice->size);
+        l_slice->size = r_slice->size;
+    }
+    static void set_to_max(char* buf) {
+        StringSlice* slice = reinterpret_cast<StringSlice*>(buf);
+        slice->size = 1;
+        memset(slice->data, 0xFF, 1);
+    }
+    static void set_to_min(char* buf) {
+        StringSlice* slice = reinterpret_cast<StringSlice*>(buf);
+        slice->size = 0;
+    }
+    static bool is_min(char* buf) {
+        StringSlice* slice = reinterpret_cast<StringSlice*>(buf);
+        return (slice->size == 0);
+    }
+    static uint32_t hash_code(char* data, uint32_t seed) {
+        StringSlice* slice = reinterpret_cast<StringSlice*>(data);
+        return HashUtil::hash(slice->data, slice->size, seed);
+    }
+};
+
+template<>
+struct FieldTypeTraits<OLAP_FIELD_TYPE_HLL> : public FieldTypeTraits<OLAP_FIELD_TYPE_VARCHAR> {
+    /*
+     * Hyperloglog type only used as value, so
+     * cmp/from_string/set_to_max/set_to_min/is_min function
+     * in this struct has no significance
+     */
+    static const char* name() {
+        return "hyperloglog";
+    }
+};
+
+// Instantiate this template to get static access to the type traits.
+template<FieldType field_type>
+struct TypeTraits : public FieldTypeTraits<field_type> {
+    typedef typename FieldTypeTraits<field_type>::CppType CppType;
+
+    static const FieldType type = field_type;
+    static const int32_t size = sizeof(CppType);
+};
+
+} // namespace palo
+
+#endif // BDG_PALO_BE_SRC_OLAP_TYPES_H
diff --git a/be/src/olap/wrapper_field.cpp b/be/src/olap/wrapper_field.cpp
new file mode 100644
index 0000000000..d432bd2fc4
--- /dev/null
+++ b/be/src/olap/wrapper_field.cpp
@@ -0,0 +1,75 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/wrapper_field.h"
+
+namespace palo {
+
+WrapperField* WrapperField::create(const FieldInfo& info, uint32_t len) {
+    bool is_string_type =
+        (info.type == OLAP_FIELD_TYPE_CHAR || info.type == OLAP_FIELD_TYPE_VARCHAR);
+    if (is_string_type && len > OLAP_STRING_MAX_LENGTH) {
+        OLAP_LOG_WARNING("length of string parameter is too long[len=%lu, max_len=%lu].",
+                        len, OLAP_STRING_MAX_LENGTH);
+        return nullptr;
+    }
+
+    Field* rep = Field::create(info);
+    if (rep == nullptr) {
+        return nullptr;
+    }
+
+    size_t variable_len = 0;
+    if (info.type == OLAP_FIELD_TYPE_CHAR) {
+        variable_len = std::max(len, info.length);
+    } else if (info.type == OLAP_FIELD_TYPE_VARCHAR) {
+        variable_len = std::max(len,
+                static_cast<uint32_t>(info.length - sizeof(StringLengthType)));
+    } else {
+        variable_len = info.length;
+    }
+
+    WrapperField* wrapper = new WrapperField(rep, variable_len, is_string_type);
+    return wrapper;
+}
+
+WrapperField* WrapperField::create_by_type(const FieldType& type) {
+    Field* rep = Field::create_by_type(type);
+    if (rep == nullptr) {  // propagate creation failure to the caller
+        return nullptr;
+    }
+    bool is_string_type = (type == OLAP_FIELD_TYPE_CHAR || type == OLAP_FIELD_TYPE_VARCHAR);
+    WrapperField* wrapper = new WrapperField(rep, 0, is_string_type);  // variable_len = 0: no extra variable-length payload reserved
+    return wrapper;
+}
+
+WrapperField::WrapperField(Field* rep, size_t variable_len, bool is_string_type)
+        : _rep(rep), _is_string_type(is_string_type)  {
+    size_t fixed_len = _rep->size();
+    _length = fixed_len + variable_len + 1;
+    _field_buf = new char[_length];
+    memset(_field_buf, 0, _length);
+    _owned_buf = _field_buf;
+    _is_null = _field_buf;
+    _buf = _field_buf + 1;
+
+    if (_is_string_type) {
+        StringSlice* slice = reinterpret_cast<StringSlice*>(_buf);
+        slice->size = variable_len;
+        slice->data = _buf + fixed_len;
+    }
+}
+
+}
diff --git a/be/src/olap/wrapper_field.h b/be/src/olap/wrapper_field.h
new file mode 100644
index 0000000000..e18afd3d6e
--- /dev/null
+++ b/be/src/olap/wrapper_field.h
@@ -0,0 +1,142 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_SRC_OLAP_WRAPPER_FIELD_H
+#define BDG_PALO_BE_SRC_OLAP_WRAPPER_FIELD_H
+
+#include "olap/field.h"
+#include "olap/olap_define.h"
+#include "util/hash_util.hpp"
+
+namespace palo {
+
+class WrapperField {
+public:
+    static WrapperField* create(const FieldInfo& info, uint32_t len = 0);
+    static WrapperField* create_by_type(const FieldType& type);
+
+    WrapperField(Field* rep, size_t variable_len, bool is_string_type);
+
+    virtual ~WrapperField() {
+        delete _rep;
+        delete [] _owned_buf;
+    }
+
+    // Convert the internal value to a string.
+    // Performance is not a concern here; for DEBUG use only.
+    std::string to_string() const {
+        return _rep->to_string(_buf);
+    }
+
+    // Deserialize the field's value from the given string.
+    // The argument must be a '\0'-terminated string.
+    OLAPStatus from_string(const std::string& value_string) {
+        _rep->from_string(_buf, value_string);
+        return OLAP_SUCCESS;
+    }
+
+    // Attach to an external value buffer.
+    void attach_buf(char* buf) {
+        _buf = buf;
+    }
+
+    void attach_field(char* field) {
+        _field_buf = field;
+        _is_null = _field_buf;
+        _buf = _field_buf + 1;
+    }
+
+    bool is_string_type() const { return _is_string_type; }
+
+    char* ptr() const {
+        return _buf;
+    }
+
+    char* field_ptr() const {
+        return _field_buf;
+    }
+
+    size_t size() const {
+        return _rep->size();
+    }
+
+    size_t field_size() const {
+        return _rep->field_size();
+    }
+
+    bool is_null() const {
+        return *reinterpret_cast<bool*>(_is_null);
+    }
+
+    void set_null() {
+        *reinterpret_cast<bool*>(_is_null) = true;
+    }
+
+    void set_not_null() {
+        *reinterpret_cast<bool*>(_is_null) = false;
+    }
+
+    char* get_null() const {
+        return _is_null;
+    }
+
+    void set_to_max() {
+        _rep->set_to_max(_buf);
+    }
+
+    void set_to_min() {
+        _rep->set_to_min(_buf);
+    }
+
+    bool is_min() {
+        return _rep->is_min(_buf);
+    }
+
+    int cmp(const WrapperField* field) const {
+        return _rep->cmp(_field_buf, field->field_ptr());
+    }
+
+    int cmp(char* right) const {
+        return _rep->cmp(_field_buf, right);
+    }
+
+    void copy(const WrapperField* field) {
+        _rep->copy_without_pool(_field_buf, field->field_ptr());
+    }
+
+    void copy(char* src) {
+        _rep->copy_without_pool(_field_buf, src);
+    }
+
+    uint32_t hash_code() const {
+        uint32_t hash_code = 0;
+        return _rep->hash_code(_buf + _rep->get_offset(), hash_code);
+    }
+
+private:
+    Field* _rep = nullptr;
+    bool _is_string_type;
+    char* _field_buf = nullptr;
+    char* _owned_buf = nullptr;
+    char* _buf = nullptr;
+    char* _is_null = nullptr;
+
+    //include fixed and variable length and null bytes
+    size_t _length;
+};
+
+}
+
+#endif
diff --git a/be/src/olap/writer.cpp b/be/src/olap/writer.cpp
index b57ea64441..ceaa26e70d 100644
--- a/be/src/olap/writer.cpp
+++ b/be/src/olap/writer.cpp
@@ -119,7 +119,7 @@ OLAPStatus OLAPDataWriter::init(uint32_t num_rows_per_row_block) {
     if (_is_push_write) {
         _write_mbytes_per_sec = config::push_write_mbytes_per_sec;
     } else {
-        _write_mbytes_per_sec = config::base_expansion_write_mbytes_per_sec;
+        _write_mbytes_per_sec = config::base_compaction_write_mbytes_per_sec;
     }
     
     _speed_limit_watch.reset();
@@ -134,13 +134,8 @@ OLAPStatus OLAPDataWriter::attached_by(RowCursor* row_cursor) {
             return OLAP_ERR_OTHER_ERROR;
         }
     }
-
     // Row points to the memory that needs to write in _row_block.
-    if (OLAP_SUCCESS != _row_block->get_row_to_write(_row_index, row_cursor)) {
-        OLAP_LOG_WARNING("fail to get row in row_block. [row_num=%u]", _row_index);
-        return OLAP_ERR_OTHER_ERROR;
-    }
-
+    _row_block->get_row(_row_index, row_cursor);
     return OLAP_SUCCESS;
 }
 
@@ -205,7 +200,7 @@ OLAPStatus OLAPDataWriter::write_row_block(RowBlock* row_block) {
     // Add row block into olap data.
     uint32_t start_offset;
     uint32_t end_offset;
-    if (OLAP_SUCCESS != _data->add_row_block(*row_block,
+    if (OLAP_SUCCESS != _data->add_row_block(row_block,
                                              &start_offset,
                                              &end_offset)) {
         OLAP_LOG_WARNING("fail to write data.");
@@ -273,4 +268,8 @@ uint64_t OLAPDataWriter::written_bytes() {
     return _current_segment_size + _index->num_segments() * _max_segment_size;
 }
 
+MemPool* OLAPDataWriter::mem_pool() {
+    return _row_block->mem_pool();
+}
+
 }  // namespace palo
diff --git a/be/src/olap/writer.h b/be/src/olap/writer.h
index 1f8f5d60ac..10677767dd 100644
--- a/be/src/olap/writer.h
+++ b/be/src/olap/writer.h
@@ -17,6 +17,7 @@
 #define BDG_PALO_BE_SRC_OLAP_WRITER_H
 
 #include "olap/olap_table.h"
+#include "olap/wrapper_field.h"
 
 namespace palo {
 class OLAPData;
@@ -33,7 +34,8 @@ public:
     IWriter(bool is_push_write, SmartOLAPTable table) : 
             _is_push_write(is_push_write), 
             _table(table),
-            _column_statistics(_table->num_key_fields(), std::pair(NULL, NULL)),
+            _column_statistics(
+                _table->num_key_fields(), std::pair(NULL, NULL)),
             _row_index(0) {}
     virtual ~IWriter() {
         for (size_t i = 0; i < _column_statistics.size(); ++i) {
@@ -44,26 +46,18 @@ public:
     virtual OLAPStatus init() {
         OLAPStatus res = OLAP_SUCCESS;
         for (size_t i = 0; i < _column_statistics.size(); ++i) {
-            _column_statistics[i].first = Field::create(_table->tablet_schema()[i]);
+            _column_statistics[i].first = WrapperField::create(_table->tablet_schema()[i]);
             if (_column_statistics[i].first == NULL) {
                 OLAP_LOG_FATAL("fail to create column statistics field. [field_id=%lu]", i);
                 return OLAP_ERR_MALLOC_ERROR;
             }
-            if (!_column_statistics[i].first->allocate()) {
-                OLAP_LOG_FATAL("fail to allocate column statistics field. [field_id=%lu]", i);
-                return OLAP_ERR_MALLOC_ERROR;
-            }
             _column_statistics[i].first->set_to_max();
 
-            _column_statistics[i].second = Field::create(_table->tablet_schema()[i]);
+            _column_statistics[i].second = WrapperField::create(_table->tablet_schema()[i]);
             if (_column_statistics[i].second == NULL) {
                 OLAP_LOG_FATAL("fail to create column statistics field. [field_id=%lu]", i);
                 return OLAP_ERR_MALLOC_ERROR;
             }
-            if (!_column_statistics[i].second->allocate()) {
-                OLAP_LOG_FATAL("fail to allocate column statistics field. [field_id=%lu]", i);
-                return OLAP_ERR_MALLOC_ERROR;
-            }
             _column_statistics[i].second->set_null();
             _column_statistics[i].second->set_to_min();
         }
@@ -72,18 +66,13 @@ public:
     virtual OLAPStatus attached_by(RowCursor* row_cursor) = 0;
     void next(const RowCursor& row_cursor) {
         for (size_t i = 0; i < _table->num_key_fields(); ++i) {
-            /*
-            if (NULL == row_cursor.get_field_by_index(i)) {
-                _column_statistics[i].first->set_null();
-                continue;
-            }
-            */
-            if (_column_statistics[i].first->cmp(row_cursor.get_field_by_index(i)) > 0) {
-                _column_statistics[i].first->copy(row_cursor.get_field_by_index(i));
+            char* right = row_cursor.get_field_by_index(i)->get_field_ptr(row_cursor.get_buf());
+            if (_column_statistics[i].first->cmp(right) > 0) {
+                _column_statistics[i].first->copy(right);
             }
 
-            if (_column_statistics[i].second->cmp(row_cursor.get_field_by_index(i)) < 0) {
-                _column_statistics[i].second->copy(row_cursor.get_field_by_index(i));
+            if (_column_statistics[i].second->cmp(right) < 0) {
+                _column_statistics[i].second->copy(right);
             }
         }
 
@@ -92,6 +81,7 @@ public:
     virtual OLAPStatus finalize() = 0;
     virtual OLAPStatus write_row_block(RowBlock* row_block) = 0;
     virtual uint64_t written_bytes() = 0;
+    virtual MemPool* mem_pool() = 0;
     // Factory function
     // 调用者获得新建的对象, 并负责delete释放
     static IWriter* create(SmartOLAPTable table, OLAPIndex* index, bool is_push_write);
@@ -99,7 +89,8 @@ public:
 protected:
     bool _is_push_write;
     SmartOLAPTable _table;
-    std::vector > _column_statistics; // first is min, second is max
+    // first is min, second is max
+    std::vector> _column_statistics;
     uint32_t _row_index;
 };
 
@@ -158,6 +149,7 @@ public:
     virtual OLAPStatus finalize();
 
     virtual uint64_t written_bytes();
+    virtual MemPool* mem_pool();
 
 private:
     OLAPIndex* _index;
diff --git a/be/src/rpc/comm.cpp b/be/src/rpc/comm.cpp
index 2281443d54..949bd5c619 100644
--- a/be/src/rpc/comm.cpp
+++ b/be/src/rpc/comm.cpp
@@ -58,6 +58,17 @@ Comm::Comm() {
     m_handler_map = ReactorRunner::handler_map;
 }
 
+Comm::Comm(const char* host) {  // test-only ctor (see "used for test" note in comm.h)
+    if (ReactorFactory::ms_reactors.size() == 0) {
+        LOG(ERROR) << "reactor_factory::initialize must be called before creating "
+                   << "rpc::comm object";
+        abort();
+    }
+    InetAddr::initialize(&m_local_addr, host, 0);  // local addr from given host, port 0
+    ReactorFactory::get_timer_reactor(m_timer_reactor);
+    m_handler_map = ReactorRunner::handler_map;  // shared handler map, same as default ctor
+}
+
 Comm::~Comm() {
     m_handler_map->decomission_all();
     // wait for all decomissioned handlers to get purged by Reactor
@@ -235,8 +246,7 @@ Comm::listen(const CommAddress &addr, ConnectionHandlerFactoryPtr &chf,
     return error;
 }
 
-int
-Comm::send_request(const CommAddress &addr, uint32_t timeout_ms,
+int Comm::send_request(const CommAddress &addr, uint32_t timeout_ms,
                    CommBufPtr &cbuf, DispatchHandler *resp_handler) {
     IOHandlerData *data_handler = 0;
     int error = 0;
diff --git a/be/src/rpc/comm.h b/be/src/rpc/comm.h
index 0d2066a984..54228fc3c4 100644
--- a/be/src/rpc/comm.h
+++ b/be/src/rpc/comm.h
@@ -431,6 +431,8 @@ private:
 
     /** Private constructor (prevent non-singleton usage). */
     Comm();
+    // used for test
+    Comm(const char* host);
 
     /** Destructor */
     ~Comm();
diff --git a/be/src/rpc/comm_buf.h b/be/src/rpc/comm_buf.h
index ecd78a556b..02eda4dde6 100644
--- a/be/src/rpc/comm_buf.h
+++ b/be/src/rpc/comm_buf.h
@@ -138,6 +138,12 @@ public:
         return data_ptr; 
     }
 
+    // get user data ptr and size
+    void get_user_data(const uint8_t** ptr, uint32_t* size) {
+        *ptr = (const uint8_t*)data.base + header.encoded_length();
+        *size = header.total_len - header.encoded_length() - ext.size;
+    }
+
     /** Returns address of the primary buffer internal data pointer
     */
     uint8_t** get_data_ptr_address() { 
diff --git a/be/src/rpc/command.h b/be/src/rpc/command.h
new file mode 100644
index 0000000000..7857454b99
--- /dev/null
+++ b/be/src/rpc/command.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2018, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace palo {
+
+#define RPC_COMMAND_TRANSMIT_DATA           0
+#define RPC_COMMAND_TABLET_WRITER_OPEN      1
+#define RPC_COMMAND_TABLET_WRITER_ADD_BATCH 2
+#define RPC_COMMAND_TABLET_WRITER_CLOSE     3
+#define RPC_COMMAND_TABLET_WRITER_CANCEL    4
+
+}
diff --git a/be/src/rpc/connection_manager.cpp b/be/src/rpc/connection_manager.cpp
index d48a7522ee..67c828ab30 100644
--- a/be/src/rpc/connection_manager.cpp
+++ b/be/src/rpc/connection_manager.cpp
@@ -335,8 +335,7 @@ int ConnectionManager::remove_unlocked(const CommAddress &addr) {
  *
  * @param event shared pointer to event object
  */
-void
-ConnectionManager::handle(EventPtr &event) {
+void ConnectionManager::handle(EventPtr &event) {
     std::lock_guard lock(m_impl->mutex);
     ConnectionStatePtr conn_state;
     {
@@ -414,8 +413,8 @@ ConnectionManager::handle(EventPtr &event) {
             conn_state->handler->handle(event);
     }
     else {
-        LOG(WARNING) << "unable to find connection in map."
-                     << "[addr=" << InetAddr::format(event->addr).c_str() << "]";
+        LOG(WARNING) << "unable to find connection in map. addr="
+            << InetAddr::format(event->addr);
     }
 }
 
@@ -438,11 +437,10 @@ void ConnectionManager::send_initialization_request(ConnectionStatePtr &conn_sta
 void ConnectionManager::schedule_retry(ConnectionStatePtr &conn_state,
                                        const std::string &message) {
     if (!m_impl->quiet_mode) {
-        LOG(ERROR) << "connection error, will retry in " << (int)conn_state->timeout_ms << " milliseconds."
-                   << "[client_addr=" << inet_ntoa(conn_state->addr.inet.sin_addr) << ", "
-                   << "client_port=" << ntohs(conn_state->addr.inet.sin_port) << ", "
-                   << "error=" << message.c_str() << ", "
-                   << "service_name" << conn_state->service_name.c_str() << "]";
+        LOG(ERROR) << "connection failed, retry in " << conn_state->timeout_ms << "ms."
+                   << " addr=" << conn_state->addr.to_str()
+                   << ", error=" << message
+                   << ", service_name=" << conn_state->service_name;
     }
     // this logic could proably be smarter.  For example, if the last
     // connection attempt was a long time ago, then schedule immediately
diff --git a/be/src/rpc/connection_manager.h b/be/src/rpc/connection_manager.h
index cae973d73a..90ddfaf61d 100644
--- a/be/src/rpc/connection_manager.h
+++ b/be/src/rpc/connection_manager.h
@@ -325,7 +325,7 @@ class ConnectionManager : public DispatchHandler {
      * which the event was generated.
      * @param event Comm layer event
      */
-    virtual void handle(EventPtr &event);
+    void handle(EventPtr& event) override;
 
     /** Connect retry loop.
      * This method is called as the retry thread function.
diff --git a/be/src/rpc/dispatch_handler.h b/be/src/rpc/dispatch_handler.h
index 501c56e645..356012d359 100644
--- a/be/src/rpc/dispatch_handler.h
+++ b/be/src/rpc/dispatch_handler.h
@@ -28,17 +28,17 @@ class IOHandler;
  * is notified of communication events.
  */
 class DispatchHandler : public std::enable_shared_from_this {
-    public:
-        /** Destructor
-        */
-        virtual ~DispatchHandler() { return; }
-        /** Callback method.  When the Comm layer needs to deliver an event to the
-         * application, this method is called to do so.  The set of event types
-         * include, CONNECTION_ESTABLISHED, DISCONNECT, MESSAGE, ERROR, and TIMER.
-         *
-         * @param event_ptr smart pointer to Event object
-         */
-        virtual void handle(EventPtr &event_ptr) = 0;
+public:
+    /** Destructor
+    */
+    virtual ~DispatchHandler() { }
+    /** Callback method.  When the Comm layer needs to deliver an event to the
+     * application, this method is called to do so.  The set of event types
+     * include, CONNECTION_ESTABLISHED, DISCONNECT, MESSAGE, ERROR, and TIMER.
+     *
+     * @param event_ptr smart pointer to Event object
+     */
+    virtual void handle(EventPtr& event_ptr) = 0;
 };
 
 /// Smart pointer to DispatchHandler
diff --git a/be/src/rpc/inet_addr.cpp b/be/src/rpc/inet_addr.cpp
index c8604e6911..41d1b100a2 100644
--- a/be/src/rpc/inet_addr.cpp
+++ b/be/src/rpc/inet_addr.cpp
@@ -67,9 +67,8 @@ bool InetAddr::initialize(sockaddr_in *addr, const char *host, uint16_t port) {
         struct hostent *he = 0;
         char hbuf[2048];
         int err = 0;
-        if (gethostbyname_r(host, &hent, hbuf, sizeof(hbuf), &he, &err) != 0
-                || he == 0) {
-            LOG(ERROR) << "gethostbyname '%s': error: %d" << host << err;
+        if (gethostbyname_r(host, &hent, hbuf, sizeof(hbuf), &he, &err) != 0 || he == 0) {
+            LOG(ERROR) << "gethostbyname host=" << host << ", error: " << err;
             return false;
         }
         memcpy(&addr->sin_addr.s_addr, he->h_addr_list[0], sizeof(uint32_t));
diff --git a/be/src/rpc/io_handler.h b/be/src/rpc/io_handler.h
index 8a07ed185a..5210effa0f 100644
--- a/be/src/rpc/io_handler.h
+++ b/be/src/rpc/io_handler.h
@@ -22,6 +22,7 @@
 #include "reactor_factory.h"
 #include "expire_timer.h"
 #include "common/logging.h"
+#include "util/debug_util.h"
 
 #include 
 
@@ -113,7 +114,7 @@ public:
             dh->handle(event);
         } else {
             if (!m_dispatch_handler) {
-                LOG(INFO) << "event: " << event->to_str().c_str();
+                LOG(INFO) << "event: " << event->to_str();
             } else {
                 m_dispatch_handler->handle(event);
             }
diff --git a/be/src/rpc/io_handler_data.cpp b/be/src/rpc/io_handler_data.cpp
index e088d256ff..cc73adf2d0 100644
--- a/be/src/rpc/io_handler_data.cpp
+++ b/be/src/rpc/io_handler_data.cpp
@@ -225,7 +225,7 @@ void IOHandlerData::handle_message_header(ClockT::time_point arrival_time) {
 }
 
 void IOHandlerData::handle_message_body() {
-    DispatchHandler *dh {};
+    DispatchHandlerPtr dhp;
     if (m_event->header.flags & CommHeader::FLAGS_BIT_PROXY_MAP_UPDATE) {
         ReactorRunner::handler_map->update_proxy_map((const char *)m_message,
                 m_event->header.total_len - m_event->header.header_len);
@@ -234,7 +234,7 @@ void IOHandlerData::handle_message_body() {
     }
     else if ((m_event->header.flags & CommHeader::FLAGS_BIT_REQUEST) == 0 &&
              (m_event->header.id == 0
-              || !m_reactor->remove_request(m_event->header.id, dh))) {
+              || !m_reactor->remove_request(m_event->header.id, dhp))) {
         if ((m_event->header.flags & CommHeader::FLAGS_BIT_IGNORE_RESPONSE) == 0) {
             LOG(WARNING) << "received response for non-pending event."
                          << "[request_id=" << m_event->header.id << ", "
@@ -253,7 +253,7 @@ void IOHandlerData::handle_message_body() {
             std::lock_guard lock(m_mutex);
             m_event->set_proxy(m_proxy);
         }
-        deliver_event(m_event, dh);
+        deliver_event(m_event, dhp.get());
     }
     reset_incoming_message_state();
 }
diff --git a/be/src/rpc/reactor.cpp b/be/src/rpc/reactor.cpp
index 5ffc6f8455..3eff08b107 100644
--- a/be/src/rpc/reactor.cpp
+++ b/be/src/rpc/reactor.cpp
@@ -124,9 +124,9 @@ void Reactor::handle_timeouts(PollTimeout &next_timeout) {
         {
             std::lock_guard lock(m_mutex);
             IOHandler *handler = 0;
-            DispatchHandler *dh = 0;
+            DispatchHandlerPtr dhp;
             now = ClockT::now();
-            while (m_request_cache.get_next_timeout(now, handler, dh,
+            while (m_request_cache.get_next_timeout(now, handler, dhp,
                                                     &next_req_timeout, &header_id)) {
                 event = std::make_shared(
                             Event::ERROR,
@@ -134,7 +134,7 @@ void Reactor::handle_timeouts(PollTimeout &next_timeout) {
                             error::REQUEST_TIMEOUT);
                 event->set_proxy(((IOHandlerData *)handler)->get_proxy());
                 event->header.id = header_id;
-                handler->deliver_event(event, dh);
+                handler->deliver_event(event, dhp.get());
             }
             if (next_req_timeout != ClockT::time_point()) {
                 next_timeout.set(now, next_req_timeout);
diff --git a/be/src/rpc/reactor.h b/be/src/rpc/reactor.h
index 9ee567665f..985648add5 100644
--- a/be/src/rpc/reactor.h
+++ b/be/src/rpc/reactor.h
@@ -83,7 +83,7 @@ public:
      * @param handler Removed dispatch handler
      * @return true if request removed, false otherwise
      */
-    bool remove_request(uint32_t id, DispatchHandler *&handler) {
+    bool remove_request(uint32_t id, DispatchHandlerPtr& handler) {
         std::lock_guard lock(m_mutex);
         return m_request_cache.remove(id, handler);
     }
diff --git a/be/src/rpc/request_cache.cpp b/be/src/rpc/request_cache.cpp
index 38e5e1613a..761057e668 100644
--- a/be/src/rpc/request_cache.cpp
+++ b/be/src/rpc/request_cache.cpp
@@ -26,8 +26,7 @@ void
 RequestCache::insert(uint32_t id, IOHandler *handler, DispatchHandler *dh,
                      ClockT::time_point &expire) {
     VLOG(3) << "Adding id %d" << id;
-    IdHandlerMap::iterator iter = m_id_map.find(id);
-    assert(iter == m_id_map.end());
+    assert(m_id_map.find(id) == m_id_map.end());
     CacheNode *node = new CacheNode(id, handler, dh);
     node->expire = expire;
     if (m_head == 0) {
@@ -43,7 +42,7 @@ RequestCache::insert(uint32_t id, IOHandler *handler, DispatchHandler *dh,
     m_id_map[id] = node;
 }
 
-bool RequestCache::remove(uint32_t id, DispatchHandler *&handler) {
+bool RequestCache::remove(uint32_t id, DispatchHandlerPtr& handler) {
     VLOG(3) << "remove request_id from request_cache. [request_id=" << id << "]";
     IdHandlerMap::iterator iter = m_id_map.find(id);
     if (iter == m_id_map.end()) {
@@ -64,13 +63,13 @@ bool RequestCache::remove(uint32_t id, DispatchHandler *&handler) {
         node->next->prev = node->prev;
     }
     m_id_map.erase(iter);
-    handler = node->dh;
+    handler = node->dhp;
     delete node;
     return true;
 }
 
 bool RequestCache::get_next_timeout(ClockT::time_point &now, IOHandler *&handlerp,
-                                    DispatchHandler *&dh,
+                                    DispatchHandlerPtr& dhp,
                                     ClockT::time_point *next_timeout, uint32_t* header_id) {
     bool handler_removed = false;
     while (m_head && !handler_removed && m_head->expire <= now) {
@@ -86,7 +85,7 @@ bool RequestCache::get_next_timeout(ClockT::time_point &now, IOHandler *&handler
         m_id_map.erase(iter);
         if (node->handler != 0) {
             handlerp = node->handler;
-            dh = node->dh;
+            dhp = node->dhp;
             *header_id = node->id;
             handler_removed = true;
         }
@@ -112,7 +111,7 @@ void RequestCache::purge_requests(IOHandler *handler, int32_t error) {
             } else {
                 event = std::make_shared(Event::ERROR, handler->get_address(), proxy, error);
             }
-            handler->deliver_event(event, node->dh);
+            handler->deliver_event(event, node->dhp.get());
             node->handler = 0;  // mark for deletion
         }
     }
diff --git a/be/src/rpc/request_cache.h b/be/src/rpc/request_cache.h
index 1a47ebe709..96b0e6adeb 100644
--- a/be/src/rpc/request_cache.h
+++ b/be/src/rpc/request_cache.h
@@ -38,7 +38,11 @@ class RequestCache {
     class CacheNode {
     public:
         CacheNode(uint32_t id, IOHandler *handler, DispatchHandler *dh)
-            : id(id), handler(handler), dh(dh) {}
+                : id(id), handler(handler) {
+            if (dh != nullptr) {
+                dhp = dh->shared_from_this();
+            }
+        }
         ~CacheNode() {}
         CacheNode* prev;            //!< Doubly-linked list prev pointers
         CacheNode* next;            //!< Doubly-linked list next pointers
@@ -47,7 +51,7 @@ class RequestCache {
         IOHandler         *handler; //!< IOHandler associated with this request
         /// Callback handler to which MESSAGE, TIMEOUT, ERROR, and DISCONNECT
         /// events are delivered
-        DispatchHandler *dh;
+        DispatchHandlerPtr dhp;
     };
 
     /// RequestID-to-CacheNode map
@@ -73,7 +77,7 @@ public:
      * @param handler Removed dispatch handler
      * @return true if removed, false if not found
      */
-    bool remove(uint32_t id, DispatchHandler *&handler);
+    bool remove(uint32_t id, DispatchHandlerPtr &handler);
 
     /** Removes next request that has timed out.  This method finds the first
      * request starting from the head of the list and removes it and returns
@@ -90,7 +94,7 @@ public:
      * false otherwise
      */
     bool get_next_timeout(ClockT::time_point &now, IOHandler *&handlerp,
-                          DispatchHandler *&dh,
+                          DispatchHandlerPtr& dhp,
                           ClockT::time_point *next_timeout, uint32_t* header_id);
 
     /** Purges all requests assocated with handler.  This
diff --git a/be/src/rpc/static_buffer.h b/be/src/rpc/static_buffer.h
index cfec10678f..f023b4f254 100644
--- a/be/src/rpc/static_buffer.h
+++ b/be/src/rpc/static_buffer.h
@@ -46,10 +46,11 @@ public:
     explicit StaticBuffer(size_t len, size_t alignment=0)
         : alignment(alignment), size(len), own(true) {
             if (alignment > 0) {
-                void *vptr = 0;
+                void *vptr = nullptr;
                 size_t aligned_len = (len % alignment) == 0 ? len :
                     ((len / alignment)+1)*alignment;
-                assert(posix_memalign(&vptr, alignment, aligned_len) == 0);
+                posix_memalign(&vptr, alignment, aligned_len);
+                assert(vptr != nullptr);
                 base = (uint8_t *)vptr;
             }
             else
diff --git a/be/src/runtime/CMakeLists.txt b/be/src/runtime/CMakeLists.txt
index 16a39d08e7..48ffb79052 100644
--- a/be/src/runtime/CMakeLists.txt
+++ b/be/src/runtime/CMakeLists.txt
@@ -73,7 +73,7 @@ add_library(Runtime STATIC
   tmp_file_mgr.cc
   disk_io_mgr.cc
   disk_io_mgr_reader_context.cc
-  disk_io_mgr_scan_range.cc
+  disk_io_mgr_scan_range.cc 
   buffered_block_mgr2.cc
   test_env.cc
   mem_tracker.cpp
@@ -82,8 +82,16 @@ add_library(Runtime STATIC
   data_stream_recvr.cc
   buffered_tuple_stream2.cc
   buffered_tuple_stream2_ir.cc
+  buffered_tuple_stream3.cc
   #  export_task_mgr.cpp
   export_sink.cpp
+  bufferpool/buffer_allocator.cc
+  bufferpool/buffer_pool.cc
+  bufferpool/reservation_tracker.cc
+  bufferpool/reservation_util.cc
+  bufferpool/suballocator.cc
+  bufferpool/system_allocator.cc
+  initial_reservations.cc
 )
 
 # This test runs forever so should not be part of 'make test'
diff --git a/be/src/runtime/buffered_block_mgr2.cc b/be/src/runtime/buffered_block_mgr2.cc
index adde15b975..1d594be5c2 100644
--- a/be/src/runtime/buffered_block_mgr2.cc
+++ b/be/src/runtime/buffered_block_mgr2.cc
@@ -20,6 +20,7 @@
 
 #include "runtime/buffered_block_mgr2.h"
 
+#include "runtime/exec_env.h"
 #include "runtime/runtime_state.h"
 #include "runtime/mem_tracker.h"
 #include "runtime/mem_pool.h"
@@ -31,6 +32,7 @@
 #include "util/palo_metrics.h"
 #include "util/debug_util.h"
 #include "util/uid_util.h"
+#include "util/pretty_printer.h"
 
 using std::string;
 using std::stringstream;
@@ -218,7 +220,7 @@ BufferedBlockMgr2::BufferedBlockMgr2(RuntimeState* state, TmpFileMgr* tmp_file_m
     _unfullfilled_reserved_buffers(0),
     _total_pinned_buffers(0),
     _non_local_outstanding_writes(0),
-    _io_mgr(state->io_mgr()),
+    _io_mgr(state->exec_env()->disk_io_mgr()),
     _is_cancelled(false),
     _writes_issued(0) {
 }
@@ -250,7 +252,7 @@ Status BufferedBlockMgr2::create(
             // _s_query_to_block_mgrs[state->query_id()] = *block_mgr;
         }
     }
-    (*block_mgr)->init(state->io_mgr(), profile, parent, mem_limit);
+    (*block_mgr)->init(state->exec_env()->disk_io_mgr(), profile, parent, mem_limit);
     return Status::OK;
 }
 
@@ -796,9 +798,11 @@ Status BufferedBlockMgr2::write_unpinned_block(Block* block) {
     _bytes_written_counter->update(block->_valid_data_len);
     ++_writes_issued;
     if (_writes_issued == 1) {
+#if 0
         if (PaloMetrics::num_queries_spilled() != NULL) {
             PaloMetrics::num_queries_spilled()->increment(1);
         }
+#endif
     }
     return Status::OK;
 }
diff --git a/be/src/runtime/buffered_tuple_stream.cpp b/be/src/runtime/buffered_tuple_stream.cpp
index 2ffb72d076..78ac768e51 100644
--- a/be/src/runtime/buffered_tuple_stream.cpp
+++ b/be/src/runtime/buffered_tuple_stream.cpp
@@ -28,6 +28,7 @@
 #include "runtime/tuple_row.h"
 #include "util/bit_util.h"
 #include "util/debug_util.h"
+#include "util/pretty_printer.h"
 #include "common/status.h"
 
 namespace palo {
@@ -374,7 +375,7 @@ Status BufferedTupleStream::get_next_internal(RowBatch* batch, bool* eos,
                 continue;
             }
 
-            DCHECK_NOTNULL(tuple);
+            DCHECK(tuple != nullptr);
 
             for (int k = 0; k < _string_slots[j].second.size(); ++k) {
                 const SlotDescriptor* slot_desc = _string_slots[j].second[k];
@@ -423,7 +424,7 @@ int BufferedTupleStream::compute_row_size(TupleRow* row) const {
             continue;
         }
 
-        DCHECK_NOTNULL(tuple);
+        DCHECK(tuple != nullptr);
 
         for (int j = 0; j < _string_slots[i].second.size(); ++j) {
             const SlotDescriptor* slot_desc = _string_slots[i].second[j];
diff --git a/be/src/runtime/buffered_tuple_stream2.cc b/be/src/runtime/buffered_tuple_stream2.cc
index 9dc2666489..c49bd7407f 100644
--- a/be/src/runtime/buffered_tuple_stream2.cc
+++ b/be/src/runtime/buffered_tuple_stream2.cc
@@ -27,6 +27,7 @@
 #include "runtime/tuple_row.h"
 #include "util/bit_util.h"
 #include "util/debug_util.h"
+#include "util/pretty_printer.h"
 
 using std::stringstream;
 using std::string;
diff --git a/be/src/runtime/buffered_tuple_stream3.cc b/be/src/runtime/buffered_tuple_stream3.cc
new file mode 100644
index 0000000000..23eeea890a
--- /dev/null
+++ b/be/src/runtime/buffered_tuple_stream3.cc
@@ -0,0 +1,1113 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "runtime/buffered_tuple_stream3.inline.h"
+
+#include 
+#include 
+
+#include "runtime/bufferpool/reservation_tracker.h"
+//#include "runtime/collection_value.h"
+#include "runtime/descriptors.h"
+#include "runtime/exec_env.h"
+#include "runtime/mem_tracker.h"
+#include "runtime/row_batch.h"
+#include "runtime/runtime_state.h"
+#include "runtime/string_value.h"
+#include "runtime/tuple_row.h"
+#include "util/bit_util.h"
+#include "util/debug_util.h"
+#include "util/pretty_printer.h"
+#include "util/runtime_profile.h"
+
+#include "common/names.h"
+
+#ifdef NDEBUG
+#define CHECK_CONSISTENCY_FAST()
+#define CHECK_CONSISTENCY_FULL()
+#else
+#define CHECK_CONSISTENCY_FAST() CheckConsistencyFast()
+#define CHECK_CONSISTENCY_FULL() CheckConsistencyFull()
+#endif
+
+using namespace palo;
+using namespace strings;
+
+using BufferHandle = BufferPool::BufferHandle;
+
+BufferedTupleStream3::BufferedTupleStream3(RuntimeState* state,
+    const RowDescriptor* row_desc, BufferPool::ClientHandle* buffer_pool_client,
+    int64_t default_page_len, int64_t max_page_len, const set& ext_varlen_slots)
+  : state_(state),
+    desc_(row_desc),
+    node_id_(-1),
+    buffer_pool_(state->exec_env()->buffer_pool()),
+    buffer_pool_client_(buffer_pool_client),
+    num_pages_(0),
+    total_byte_size_(0),
+    has_read_iterator_(false),
+    read_page_reservation_(buffer_pool_client_),
+    read_page_rows_returned_(-1),
+    read_ptr_(nullptr),
+    read_end_ptr_(nullptr),
+    write_ptr_(nullptr),
+    write_end_ptr_(nullptr),
+    rows_returned_(0),
+    has_write_iterator_(false),
+    write_page_(nullptr),
+    write_page_reservation_(buffer_pool_client_),
+    bytes_pinned_(0),
+    num_rows_(0),
+    default_page_len_(default_page_len),
+    max_page_len_(max_page_len),
+    has_nullable_tuple_(row_desc->is_any_tuple_nullable()),
+    delete_on_read_(false),
+    closed_(false),
+    pinned_(true) {
+  DCHECK_GE(max_page_len, default_page_len);
+  DCHECK(BitUtil::IsPowerOf2(default_page_len)) << default_page_len;
+  DCHECK(BitUtil::IsPowerOf2(max_page_len)) << max_page_len;
+  read_page_ = pages_.end();
+  for (int i = 0; i < desc_->tuple_descriptors().size(); ++i) {
+    const TupleDescriptor* tuple_desc = desc_->tuple_descriptors()[i];
+    const int tuple_byte_size = tuple_desc->byte_size();
+    fixed_tuple_sizes_.push_back(tuple_byte_size);
+
+    vector tuple_string_slots;
+    vector tuple_coll_slots;
+    for (int j = 0; j < tuple_desc->slots().size(); ++j) {
+      SlotDescriptor* slot = tuple_desc->slots()[j];
+      if (!slot->type().is_var_len_string_type()) continue;
+      if (ext_varlen_slots.find(slot->id()) == ext_varlen_slots.end()) {
+        if (slot->type().is_var_len_string_type()) {
+          tuple_string_slots.push_back(slot);
+        } else {
+          DCHECK(slot->type().is_collection_type());
+          tuple_coll_slots.push_back(slot);
+        }
+      }
+    }
+    if (!tuple_string_slots.empty()) {
+      inlined_string_slots_.push_back(make_pair(i, tuple_string_slots));
+    }
+/*
+    if (!tuple_coll_slots.empty()) {
+      inlined_coll_slots_.push_back(make_pair(i, tuple_coll_slots));
+    }
+*/
+  }
+}
+
+BufferedTupleStream3::~BufferedTupleStream3() {
+  DCHECK(closed_);
+}
+
+void BufferedTupleStream3::CheckConsistencyFull() const {
+  CheckConsistencyFast();
+  // The below checks require iterating over all the pages in the stream.
+  DCHECK_EQ(bytes_pinned_, CalcBytesPinned()) << DebugString();
+  DCHECK_EQ(pages_.size(), num_pages_) << DebugString();
+  for (const Page& page : pages_) CheckPageConsistency(&page);
+}
+
+void BufferedTupleStream3::CheckConsistencyFast() const {
+  // All the below checks should be O(1).
+  DCHECK(has_write_iterator() || write_page_ == nullptr);
+  if (write_page_ != nullptr) {
+    CheckPageConsistency(write_page_);
+    DCHECK(write_page_->is_pinned());
+    DCHECK(write_page_->retrieved_buffer);
+    const BufferHandle* write_buffer;
+    Status status = write_page_->GetBuffer(&write_buffer);
+    DCHECK(status.ok()); // Write buffer should never have been unpinned.
+    DCHECK_GE(write_ptr_, write_buffer->data());
+    DCHECK_EQ(write_end_ptr_, write_buffer->data() + write_page_->len());
+    DCHECK_GE(write_end_ptr_, write_ptr_);
+  }
+  DCHECK(has_read_iterator() || read_page_ == pages_.end());
+  if (read_page_ != pages_.end()) {
+    CheckPageConsistency(&*read_page_);
+    DCHECK(read_page_->is_pinned());
+    DCHECK(read_page_->retrieved_buffer);
+    // Can't check read buffer without affecting behaviour, because a read may be in
+    // flight and this would require blocking on that write.
+    DCHECK_GE(read_end_ptr_, read_ptr_);
+  }
+  if (NeedReadReservation()) {
+    DCHECK_EQ(default_page_len_, read_page_reservation_.GetReservation())
+        << DebugString();
+  } else if (!read_page_reservation_.is_closed()) {
+    DCHECK_EQ(0, read_page_reservation_.GetReservation());
+  }
+  if (NeedWriteReservation()) {
+    DCHECK_EQ(default_page_len_, write_page_reservation_.GetReservation());
+  } else if (!write_page_reservation_.is_closed()) {
+    DCHECK_EQ(0, write_page_reservation_.GetReservation());
+  }
+}
+
+void BufferedTupleStream3::CheckPageConsistency(const Page* page) const {
+  DCHECK_EQ(ExpectedPinCount(pinned_, page), page->pin_count()) << DebugString();
+  // Only one large row per page.
+  if (page->len() > default_page_len_) DCHECK_LE(page->num_rows, 1);
+  // We only create pages when we have a row to append to them.
+  DCHECK_GT(page->num_rows, 0);
+}
+
+string BufferedTupleStream3::DebugString() const {
+  stringstream ss;
+  ss << "BufferedTupleStream3 num_rows=" << num_rows_
+     << " rows_returned=" << rows_returned_ << " pinned=" << pinned_
+     << " delete_on_read=" << delete_on_read_ << " closed=" << closed_ << "\n"
+     << " bytes_pinned=" << bytes_pinned_ << " has_write_iterator=" << has_write_iterator_
+     << " write_page=" << write_page_ << " has_read_iterator=" << has_read_iterator_
+     << " read_page=";
+  if (read_page_ == pages_.end()) {
+    ss << "";
+  } else {
+    ss << &*read_page_;
+  }
+  ss << "\n"
+     << " read_page_reservation=";
+  if (read_page_reservation_.is_closed()) {
+    ss << "";
+  } else {
+    ss << read_page_reservation_.GetReservation();
+  }
+  ss << " write_page_reservation=";
+  if (write_page_reservation_.is_closed()) {
+    ss << "";
+  } else {
+    ss << write_page_reservation_.GetReservation();
+  }
+  ss << "\n # pages=" << num_pages_ << " pages=[\n";
+  for (const Page& page : pages_) {
+    ss << "{" << page.DebugString() << "}";
+    if (&page != &pages_.back()) ss << ",\n";
+  }
+  ss << "]";
+  return ss.str();
+}
+
+string BufferedTupleStream3::Page::DebugString() const {
+  //return Substitute("$0 num_rows=$1", handle.DebugString(), num_rows);
+  return string("");
+}
+
+Status BufferedTupleStream3::Init(int node_id, bool pinned) {
+//  if (!pinned) UnpinStream(UNPIN_ALL_EXCEPT_CURRENT);
+  node_id_ = node_id;
+  return Status::OK;
+}
+
+Status BufferedTupleStream3::PrepareForWrite(bool* got_reservation) {
+  // This must be the first iterator created.
+  DCHECK(pages_.empty());
+  DCHECK(!delete_on_read_);
+  DCHECK(!has_write_iterator());
+  DCHECK(!has_read_iterator());
+  CHECK_CONSISTENCY_FULL();
+
+  *got_reservation = buffer_pool_client_->IncreaseReservationToFit(default_page_len_);
+  if (!*got_reservation) return Status::OK;
+  has_write_iterator_ = true;
+  // Save reservation for the write iterators.
+  buffer_pool_client_->SaveReservation(&write_page_reservation_, default_page_len_);
+  CHECK_CONSISTENCY_FULL();
+  return Status::OK;
+}
+
+Status BufferedTupleStream3::PrepareForReadWrite(
+    bool delete_on_read, bool* got_reservation) {
+  // This must be the first iterator created.
+  DCHECK(pages_.empty());
+  DCHECK(!delete_on_read_);
+  DCHECK(!has_write_iterator());
+  DCHECK(!has_read_iterator());
+  CHECK_CONSISTENCY_FULL();
+
+  *got_reservation = buffer_pool_client_->IncreaseReservationToFit(2 * default_page_len_);
+  if (!*got_reservation) return Status::OK;
+  has_write_iterator_ = true;
+  // Save reservation for both the read and write iterators.
+  buffer_pool_client_->SaveReservation(&read_page_reservation_, default_page_len_);
+  buffer_pool_client_->SaveReservation(&write_page_reservation_, default_page_len_);
+  RETURN_IF_ERROR(PrepareForReadInternal(delete_on_read));
+  return Status::OK;
+}
+
+void BufferedTupleStream3::Close(RowBatch* batch, RowBatch::FlushMode flush) {
+  for (Page& page : pages_) {
+    if (batch != nullptr && page.retrieved_buffer) {
+      // Subtle: We only need to attach buffers from pages that we may have returned
+      // references to. ExtractBuffer() cannot fail for these pages because the data
+      // is guaranteed to already be in memory.
+      BufferPool::BufferHandle buffer;
+      Status status = buffer_pool_->ExtractBuffer(buffer_pool_client_, &page.handle, &buffer);
+      DCHECK(status.ok());
+      batch->add_buffer(buffer_pool_client_, move(buffer), flush);
+    } else {
+      buffer_pool_->DestroyPage(buffer_pool_client_, &page.handle);
+    }
+  }
+  read_page_reservation_.Close();
+  write_page_reservation_.Close();
+  pages_.clear();
+  num_pages_ = 0;
+  bytes_pinned_ = 0;
+  closed_ = true;
+}
+
+int64_t BufferedTupleStream3::CalcBytesPinned() const {
+  int64_t result = 0;
+  for (const Page& page : pages_) result += page.pin_count() * page.len();
+  return result;
+}
+
+Status BufferedTupleStream3::PinPage(Page* page) {
+  RETURN_IF_ERROR(buffer_pool_->Pin(buffer_pool_client_, &page->handle));
+  bytes_pinned_ += page->len();
+  return Status::OK;
+}
+
+int BufferedTupleStream3::ExpectedPinCount(bool stream_pinned, const Page* page) const {
+  return (stream_pinned || is_read_page(page) || is_write_page(page)) ? 1 : 0;
+}
+
+Status BufferedTupleStream3::PinPageIfNeeded(Page* page, bool stream_pinned) {
+  int new_pin_count = ExpectedPinCount(stream_pinned, page);
+  if (new_pin_count != page->pin_count()) {
+    DCHECK_EQ(new_pin_count, page->pin_count() + 1);
+    RETURN_IF_ERROR(PinPage(page));
+  }
+  return Status::OK;
+}
+
+void BufferedTupleStream3::UnpinPageIfNeeded(Page* page, bool stream_pinned) {
+  int new_pin_count = ExpectedPinCount(stream_pinned, page);
+  if (new_pin_count != page->pin_count()) {
+    DCHECK_EQ(new_pin_count, page->pin_count() - 1);
+    buffer_pool_->Unpin(buffer_pool_client_, &page->handle);
+    bytes_pinned_ -= page->len();
+    if (page->pin_count() == 0) page->retrieved_buffer = false;
+  }
+}
+
+bool BufferedTupleStream3::NeedWriteReservation() const {
+  return NeedWriteReservation(pinned_);
+}
+
+bool BufferedTupleStream3::NeedWriteReservation(bool stream_pinned) const {
+  return NeedWriteReservation(stream_pinned, num_pages_, has_write_iterator(),
+      write_page_ != nullptr, has_read_write_page());
+}
+
+// Pure function deciding whether default-page reservation for the next write
+// page must be kept aside, given a (possibly hypothetical) combination of
+// stream state. Used both to inspect the current state and to reason about
+// state transitions before/after they happen.
+bool BufferedTupleStream3::NeedWriteReservation(bool stream_pinned, int64_t num_pages,
+    bool has_write_iterator, bool has_write_page, bool has_read_write_page) {
+  if (!has_write_iterator) return false;
+  // If the stream is empty the write reservation hasn't been used yet.
+  if (num_pages == 0) return true;
+  if (stream_pinned) {
+    // Make sure we've saved the write reservation for the next page if the only
+    // page is a read/write page.
+    return has_read_write_page && num_pages == 1;
+  } else {
+    // Make sure we've saved the write reservation if it's not being used to pin
+    // a page in the stream.
+    return !has_write_page || has_read_write_page;
+  }
+}
+
+// Returns whether reservation for a future read page must currently be kept
+// saved, evaluated against the stream's current pinned mode.
+bool BufferedTupleStream3::NeedReadReservation() const {
+  return NeedReadReservation(pinned_);
+}
+
+// Evaluates the read-reservation rule for a hypothetical pinned mode using the
+// stream's current page count and read-iterator state.
+bool BufferedTupleStream3::NeedReadReservation(bool stream_pinned) const {
+  return NeedReadReservation(
+      stream_pinned, num_pages_, has_read_iterator(), read_page_ != pages_.end());
+}
+
+// Overload that fills in the write-iterator state from the current stream and
+// forwards to the pure six-argument form.
+bool BufferedTupleStream3::NeedReadReservation(bool stream_pinned, int64_t num_pages,
+    bool has_read_iterator, bool has_read_page) const {
+  return NeedReadReservation(stream_pinned, num_pages, has_read_iterator, has_read_page,
+      has_write_iterator(), write_page_ != nullptr);
+}
+
+// Pure function deciding whether default-page reservation for a future read
+// page must be kept aside, given a (possibly hypothetical) combination of
+// stream state.
+bool BufferedTupleStream3::NeedReadReservation(bool stream_pinned, int64_t num_pages,
+    bool has_read_iterator, bool has_read_page, bool has_write_iterator,
+    bool has_write_page) {
+  if (!has_read_iterator) return false;
+  if (stream_pinned) {
+    // Need reservation if there are no pages currently pinned for reading but we may add
+    // a page.
+    return num_pages == 0 && has_write_iterator;
+  } else {
+    // Only need to save reservation for an unpinned stream if there is no read page
+    // and we may advance to one in the future.
+    return (has_write_iterator || num_pages > 0) && !has_read_page;
+  }
+}
+
+// Allocates a new 'page_len'-byte page from the buffer pool, appends it to the
+// stream and makes it the current write page with the write cursor at its
+// start. The page is created pinned, so 'bytes_pinned_' grows by 'page_len'.
+// Requires that no write page is currently set.
+Status BufferedTupleStream3::NewWritePage(int64_t page_len) noexcept {
+  DCHECK(!closed_);
+  DCHECK(write_page_ == nullptr);
+
+  Page new_page;
+  const BufferHandle* write_buffer;
+  RETURN_IF_ERROR(buffer_pool_->CreatePage(
+      buffer_pool_client_, page_len, &new_page.handle, &write_buffer));
+  bytes_pinned_ += page_len;
+  total_byte_size_ += page_len;
+
+  pages_.push_back(std::move(new_page));
+  ++num_pages_;
+  write_page_ = &pages_.back();
+  DCHECK_EQ(write_page_->num_rows, 0);
+  write_ptr_ = write_buffer->data();
+  write_end_ptr_ = write_ptr_ + page_len;
+  return Status::OK;
+}
+
+// Computes the page length needed to fit a serialized row of 'row_size' bytes:
+// the default page length, or the next power of two >= 'row_size' for larger
+// rows. Returns an error if the row exceeds the maximum page length.
+Status BufferedTupleStream3::CalcPageLenForRow(int64_t row_size, int64_t* page_len) {
+  if (UNLIKELY(row_size > max_page_len_)) {
+    std::stringstream ss;
+    ss << " exceed max row size, row size:"
+       << PrettyPrinter::print(row_size, TUnit::BYTES)
+       << " node id:" << node_id_;
+    // TODO: include the query option max_row_size in the message once the
+    // RuntimeState is plumbed through to this class.
+    return Status(ss.str());
+  }
+  *page_len = max(default_page_len_, BitUtil::RoundUpToPowerOfTwo(row_size));
+  return Status::OK;
+}
+
+// Attempts to open a new write page large enough for a row of 'row_size'
+// bytes. Computes the net reservation increase required after accounting for
+// saved read/write reservations that become restorable by this transition and
+// for the reservation reclaimed by unpinning the old write page, and only
+// mutates stream state once the buffer pool grants the increase.
+// '*got_reservation' is set false (with OK status) if the increase is denied.
+Status BufferedTupleStream3::AdvanceWritePage(
+    int64_t row_size, bool* got_reservation) noexcept {
+  DCHECK(has_write_iterator());
+  CHECK_CONSISTENCY_FAST();
+
+  int64_t page_len;
+
+  Status status = CalcPageLenForRow(row_size, &page_len);
+  if (!status.ok()) {
+     return status;
+  }
+
+  // Reservation may have been saved for the next write page, e.g. by PrepareForWrite()
+  // if the stream is empty.
+  int64_t write_reservation_to_restore = 0, read_reservation_to_restore = 0;
+  if (NeedWriteReservation(
+          pinned_, num_pages_, true, write_page_ != nullptr, has_read_write_page())
+      && !NeedWriteReservation(pinned_, num_pages_ + 1, true, true, false)) {
+    write_reservation_to_restore = default_page_len_;
+  }
+  // If the stream is pinned, we need to keep the previous write page pinned for reading.
+  // Check if we saved reservation for this case.
+  if (NeedReadReservation(pinned_, num_pages_, has_read_iterator(),
+          read_page_ != pages_.end(), true, write_page_ != nullptr)
+      && !NeedReadReservation(pinned_, num_pages_ + 1, has_read_iterator(),
+             read_page_ != pages_.end(), true, true)) {
+    read_reservation_to_restore = default_page_len_;
+  }
+
+  // We may reclaim reservation by unpinning a page that was pinned for writing.
+  int64_t write_page_reservation_to_reclaim =
+      (write_page_ != nullptr && !pinned_ && !has_read_write_page()) ?
+      write_page_->len() : 0;
+  // Check to see if we can get the reservation before changing the state of the stream.
+  if (!buffer_pool_client_->IncreaseReservationToFit(page_len
+          - write_reservation_to_restore - read_reservation_to_restore
+          - write_page_reservation_to_reclaim)) {
+    DCHECK(pinned_ || page_len > default_page_len_)
+        << "If the stream is unpinned, this should only fail for large pages";
+    CHECK_CONSISTENCY_FAST();
+    *got_reservation = false;
+    return Status::OK;
+  }
+  if (write_reservation_to_restore > 0) {
+    buffer_pool_client_->RestoreReservation(
+        &write_page_reservation_, write_reservation_to_restore);
+  }
+  if (read_reservation_to_restore > 0) {
+    buffer_pool_client_->RestoreReservation(
+        &read_page_reservation_, read_reservation_to_restore);
+  }
+  ResetWritePage();
+  //RETURN_IF_ERROR(NewWritePage(page_len));
+  status = NewWritePage(page_len);
+  if (UNLIKELY(!status.ok())) {
+     return status;
+  }
+  *got_reservation = true;
+  return Status::OK;
+}
+
+// Discards the current write page, if any: clears the write cursor state and
+// drops the extra pin that was held only because it was the write page.
+void BufferedTupleStream3::ResetWritePage() {
+  if (write_page_ == nullptr) return;
+  Page* old_write_page = write_page_;
+  write_page_ = nullptr;
+  write_ptr_ = nullptr;
+  write_end_ptr_ = nullptr;
+  // Pin count may drop now that this is no longer the write page, depending on
+  // the stream's mode.
+  UnpinPageIfNeeded(old_write_page, pinned_);
+}
+
+// Permanently ends writing to the stream: drops the current write page,
+// releases the saved write reservation and, if the read iterator no longer
+// needs its saved reservation once writes stop, restores that as well.
+void BufferedTupleStream3::InvalidateWriteIterator() {
+  if (!has_write_iterator()) return;
+  ResetWritePage();
+  has_write_iterator_ = false;
+  // No more pages will be appended to stream - do not need any write reservation.
+  write_page_reservation_.Close();
+  // May not need a read reservation once the write iterator is invalidated.
+  if (NeedReadReservation(pinned_, num_pages_, has_read_iterator(),
+          read_page_ != pages_.end(), true, write_page_ != nullptr)
+      && !NeedReadReservation(pinned_, num_pages_, has_read_iterator(),
+             read_page_ != pages_.end(), false, false)) {
+    buffer_pool_client_->RestoreReservation(&read_page_reservation_, default_page_len_);
+  }
+}
+
+// Advances the read iterator to the next page (or to the first page if reading
+// has not started yet). In delete-on-read mode the previous page is destroyed;
+// otherwise it may just be unpinned. Pins the new read page, positions the
+// read cursor at its start, and re-saves the write reservation if the write
+// page became a read/write page.
+Status BufferedTupleStream3::NextReadPage() {
+  DCHECK(has_read_iterator());
+  DCHECK(!closed_);
+  CHECK_CONSISTENCY_FAST();
+
+  if (read_page_ == pages_.end()) {
+    // No rows read yet - start reading at first page. If the stream is unpinned, we can
+    // use the reservation saved in PrepareForReadWrite() to pin the first page.
+    read_page_ = pages_.begin();
+    if (NeedReadReservation(pinned_, num_pages_, true, false)
+        && !NeedReadReservation(pinned_, num_pages_, true, true)) {
+      buffer_pool_client_->RestoreReservation(&read_page_reservation_, default_page_len_);
+    }
+  } else if (delete_on_read_) {
+    DCHECK(read_page_ == pages_.begin()) << read_page_->DebugString() << " "
+                                         << DebugString();
+    DCHECK_NE(&*read_page_, write_page_);
+    bytes_pinned_ -= pages_.front().len();
+    buffer_pool_->DestroyPage(buffer_pool_client_, &pages_.front().handle);
+    pages_.pop_front();
+    --num_pages_;
+    read_page_ = pages_.begin();
+  } else {
+    // Unpin pages after reading them if needed.
+    Page* prev_read_page = &*read_page_;
+    ++read_page_;
+    UnpinPageIfNeeded(prev_read_page, pinned_);
+  }
+
+  if (read_page_ == pages_.end()) {
+    CHECK_CONSISTENCY_FULL();
+    return Status::OK;
+  }
+
+  if (!pinned_ && read_page_->len() > default_page_len_
+      && buffer_pool_client_->GetUnusedReservation() < read_page_->len()) {
+    // If we are iterating over an unpinned stream and encounter a page that is larger
+    // than the default page length, then unpinning the previous page may not have
+    // freed up enough reservation to pin the next one. The client is responsible for
+    // ensuring the reservation is available, so this indicates a bug.
+    std::stringstream err_stream;
+    err_stream << "Internal error: couldn't pin large page of " << read_page_->len()
+               << " bytes, client only had " << buffer_pool_client_->GetUnusedReservation()
+               << " bytes of unused reservation:" << buffer_pool_client_->DebugString() << "\n";
+    return Status(err_stream.str());
+  }
+  // Ensure the next page is pinned for reading. By this point we should have enough
+  // reservation to pin the page. If the stream is pinned, the page is already pinned.
+  // If the stream is unpinned, we freed up enough memory for a default-sized page by
+  // deleting or unpinning the previous page and ensured that, if the page was larger,
+  // that the reservation is available with the above check.
+  RETURN_IF_ERROR(PinPageIfNeeded(&*read_page_, pinned_));
+
+  // This waits for the pin to complete if the page was unpinned earlier.
+  const BufferHandle* read_buffer;
+  RETURN_IF_ERROR(read_page_->GetBuffer(&read_buffer));
+
+  read_page_rows_returned_ = 0;
+  read_ptr_ = read_buffer->data();
+  read_end_ptr_ = read_ptr_ + read_buffer->len();
+
+  // We may need to save reservation for the write page in the case when the write page
+  // became a read/write page.
+  if (!NeedWriteReservation(pinned_, num_pages_, has_write_iterator(),
+             write_page_ != nullptr, false)
+      && NeedWriteReservation(pinned_, num_pages_, has_write_iterator(),
+             write_page_ != nullptr, has_read_write_page())) {
+    buffer_pool_client_->SaveReservation(&write_page_reservation_, default_page_len_);
+  }
+  CHECK_CONSISTENCY_FAST();
+  return Status::OK;
+}
+
+// Ends the current read pass: releases the read page's extra pin, returns any
+// saved read reservation to the client, and re-arms delete-on-read if nothing
+// was consumed yet so the stream can be read again.
+void BufferedTupleStream3::InvalidateReadIterator() {
+  if (read_page_ != pages_.end()) {
+    // Unpin the write page if we're reading in unpinned mode.
+    Page* prev_read_page = &*read_page_;
+    read_page_ = pages_.end();
+    read_ptr_ = nullptr;
+    read_end_ptr_ = nullptr;
+
+    // May need to decrement pin count after destroying read iterator.
+    UnpinPageIfNeeded(prev_read_page, pinned_);
+  }
+  has_read_iterator_ = false;
+  if (read_page_reservation_.GetReservation() > 0) {
+    buffer_pool_client_->RestoreReservation(&read_page_reservation_, default_page_len_);
+  }
+  // It is safe to re-read a delete-on-read stream if no rows were read and no pages
+  // were therefore deleted.
+  if (rows_returned_ == 0) delete_on_read_ = false;
+}
+
+// Finalizes writes and (re)starts reading from the beginning of the stream.
+// '*got_reservation' is false if reservation to pin the first page of an
+// unpinned, non-empty stream could not be acquired.
+Status BufferedTupleStream3::PrepareForRead(bool delete_on_read, bool* got_reservation) {
+  CHECK_CONSISTENCY_FULL();
+  InvalidateWriteIterator();
+  InvalidateReadIterator();
+  // A pinned or empty stream needs no additional pin to start reading (see
+  // ExpectedPinCount()); otherwise reserve room to pin the first page.
+  if (pinned_ || pages_.empty()) {
+    *got_reservation = true;
+  } else {
+    *got_reservation =
+        buffer_pool_client_->IncreaseReservationToFit(default_page_len_);
+  }
+  if (!*got_reservation) return Status::OK;
+  return PrepareForReadInternal(delete_on_read);
+}
+
+// Starts a read pass from the first page: eagerly pins the first page (if any)
+// and positions the read cursor at its start, then resets the per-pass row
+// counters. Requires that no read iterator is currently active.
+Status BufferedTupleStream3::PrepareForReadInternal(bool delete_on_read) {
+  DCHECK(!closed_);
+  DCHECK(!delete_on_read_);
+  DCHECK(!has_read_iterator());
+
+  has_read_iterator_ = true;
+  if (pages_.empty()) {
+    // No rows to return, or a the first read/write page has not yet been allocated.
+    read_page_ = pages_.end();
+    read_ptr_ = nullptr;
+    read_end_ptr_ = nullptr;
+  } else {
+    // Eagerly pin the first page in the stream.
+    read_page_ = pages_.begin();
+    // Check if we need to increment the pin count of the read page.
+    RETURN_IF_ERROR(PinPageIfNeeded(&*read_page_, pinned_));
+    DCHECK(read_page_->is_pinned());
+
+    // This waits for the pin to complete if the page was unpinned earlier.
+    const BufferHandle* read_buffer;
+    RETURN_IF_ERROR(read_page_->GetBuffer(&read_buffer));
+    read_ptr_ = read_buffer->data();
+    read_end_ptr_ = read_ptr_ + read_buffer->len();
+  }
+  read_page_rows_returned_ = 0;
+  rows_returned_ = 0;
+  delete_on_read_ = delete_on_read;
+  CHECK_CONSISTENCY_FULL();
+  return Status::OK;
+}
+
+// Pins every page of the stream so it is fully memory-resident. First computes
+// the reservation increase required (net of saved read/write reservations that
+// become restorable), and only mutates state once the buffer pool grants the
+// increase. '*pinned' is false (with OK status) if the increase was denied.
+Status BufferedTupleStream3::PinStream(bool* pinned) {
+  DCHECK(!closed_);
+  CHECK_CONSISTENCY_FULL();
+  if (pinned_) {
+    *pinned = true;
+    return Status::OK;
+  }
+  *pinned = false;
+  // First, make sure we have the reservation to pin all the pages for reading.
+  int64_t bytes_to_pin = 0;
+  for (Page& page : pages_) {
+    bytes_to_pin += (ExpectedPinCount(true, &page) - page.pin_count()) * page.len();
+  }
+
+  // Check if we have some reservation to restore.
+  bool restore_write_reservation =
+      NeedWriteReservation(false) && !NeedWriteReservation(true);
+  bool restore_read_reservation =
+      NeedReadReservation(false) && !NeedReadReservation(true);
+  int64_t increase_needed = bytes_to_pin
+      - (restore_write_reservation ? default_page_len_ : 0)
+      - (restore_read_reservation ? default_page_len_ : 0);
+  bool reservation_granted =
+      buffer_pool_client_->IncreaseReservationToFit(increase_needed);
+  if (!reservation_granted) return Status::OK;
+
+  // If there is no current write page we should have some saved reservation to use.
+  // Only continue saving it if the stream is empty and need it to pin the first page.
+  if (restore_write_reservation) {
+    buffer_pool_client_->RestoreReservation(&write_page_reservation_, default_page_len_);
+  }
+  if (restore_read_reservation) {
+    buffer_pool_client_->RestoreReservation(&read_page_reservation_, default_page_len_);
+  }
+
+  // At this point success is guaranteed - go through to pin the pages we need to pin.
+  // If the page data was evicted from memory, the read I/O can happen in parallel
+  // because we defer calling GetBuffer() until NextReadPage().
+  for (Page& page : pages_) RETURN_IF_ERROR(PinPageIfNeeded(&page, true));
+
+  pinned_ = true;
+  *pinned = true;
+  CHECK_CONSISTENCY_FULL();
+  return Status::OK;
+}
+/*
+void BufferedTupleStream3::UnpinStream(UnpinMode mode) {
+  CHECK_CONSISTENCY_FULL();
+  DCHECK(!closed_);
+  if (mode == UNPIN_ALL) {
+    // Invalidate the iterators so they don't keep pages pinned.
+    InvalidateWriteIterator();
+    InvalidateReadIterator();
+  }
+
+  if (pinned_) {
+    CHECK_CONSISTENCY_FULL();
+    // If the stream was pinned, there may be some remaining pinned pages that should
+    // be unpinned at this point.
+    for (Page& page : pages_) UnpinPageIfNeeded(&page, false);
+
+    // Check to see if we need to save some of the reservation we freed up.
+    if (!NeedWriteReservation(true) && NeedWriteReservation(false)) {
+      buffer_pool_client_->SaveReservation(&write_page_reservation_, default_page_len_);
+    }
+    if (!NeedReadReservation(true) && NeedReadReservation(false)) {
+      buffer_pool_client_->SaveReservation(&read_page_reservation_, default_page_len_);
+    }
+    pinned_ = false;
+  }
+  CHECK_CONSISTENCY_FULL();
+}
+*/
+// Pins the whole stream and reads all of its rows into one in-memory batch.
+// '*got_rows' is false if the reservation needed to pin the stream could not
+// be acquired. NOTE: template arguments stripped by markup were restored here
+// (scoped_ptr<RowBatch>, numeric_limits<int>).
+Status BufferedTupleStream3::GetRows(
+    MemTracker* tracker, scoped_ptr<RowBatch>* batch, bool* got_rows) {
+  if (num_rows() > numeric_limits<int>::max()) {
+    // RowBatch::num_rows_ is a 32-bit int, avoid an overflow.
+    return Status(Substitute("Trying to read $0 rows into in-memory batch failed. Limit "
+                             "is $1",
+        num_rows(), numeric_limits<int>::max()));
+  }
+  RETURN_IF_ERROR(PinStream(got_rows));
+  if (!*got_rows) return Status::OK;
+  bool got_reservation;
+  RETURN_IF_ERROR(PrepareForRead(false, &got_reservation));
+  DCHECK(got_reservation) << "Stream was pinned";
+
+  // TODO chenhao: RowBatch capacity is an int but num_rows() is int64_t, so
+  // there may be precision loss here.
+  batch->reset(new RowBatch(*desc_, num_rows(), tracker));
+  bool eos = false;
+  // Loop until GetNext fills the entire batch. Each call can stop at page
+  // boundaries. We generally want it to stop, so that pages can be freed
+  // as we read. It is safe in this case because we pin the entire stream.
+  while (!eos) {
+    RETURN_IF_ERROR(GetNext(batch->get(), &eos));
+  }
+  return Status::OK;
+}
+
+// Reads up to a batch of rows; no flat row pointers are collected, so the
+// FILL_FLAT_ROWS=false specialization is used (template arg restored).
+Status BufferedTupleStream3::GetNext(RowBatch* batch, bool* eos) {
+  return GetNextInternal<false>(batch, eos, nullptr);
+}
+
+// Reads up to a batch of rows and also returns a pointer to each row's
+// flattened data; only valid on pinned streams (template args restored).
+Status BufferedTupleStream3::GetNext(
+    RowBatch* batch, bool* eos, vector<FlatRowPtr>* flat_rows) {
+  return GetNextInternal<true>(batch, eos, flat_rows);
+}
+
+// Dispatches on tuple nullability so the row-copy loop in the four-way
+// specialized GetNextInternal is resolved at compile time (template
+// parameters restored after markup stripping).
+template <bool FILL_FLAT_ROWS>
+Status BufferedTupleStream3::GetNextInternal(
+    RowBatch* batch, bool* eos, vector<FlatRowPtr>* flat_rows) {
+  if (has_nullable_tuple_) {
+    return GetNextInternal<FILL_FLAT_ROWS, true>(batch, eos, flat_rows);
+  } else {
+    return GetNextInternal<FILL_FLAT_ROWS, false>(batch, eos, flat_rows);
+  }
+}
+
+// Core read loop: copies up to a batch's worth of rows from the current read
+// page into 'batch', fixing up inlined string pointers and optionally
+// recording each row's flattened address in 'flat_rows'. Template parameters
+// and reinterpret_cast target types were restored after markup stripping.
+template <bool FILL_FLAT_ROWS, bool HAS_NULLABLE_TUPLE>
+Status BufferedTupleStream3::GetNextInternal(
+    RowBatch* batch, bool* eos, vector<FlatRowPtr>* flat_rows) {
+  DCHECK(!closed_);
+  DCHECK(batch->row_desc().equals(*desc_));
+  DCHECK(is_pinned() || !FILL_FLAT_ROWS)
+      << "FlatRowPtrs are only valid for pinned streams";
+  *eos = (rows_returned_ == num_rows_);
+  if (*eos) return Status::OK;
+
+  if (UNLIKELY(read_page_ == pages_.end()
+          || read_page_rows_returned_ == read_page_->num_rows)) {
+    // Get the next page in the stream (or the first page if read_page_ was not yet
+    // initialized.) We need to do this at the beginning of the GetNext() call to ensure
+    // the buffer management semantics. NextReadPage() may unpin or delete the buffer
+    // backing the rows returned from the *previous* call to GetNext().
+    RETURN_IF_ERROR(NextReadPage());
+  }
+
+  DCHECK(has_read_iterator());
+  DCHECK(read_page_ != pages_.end());
+  DCHECK(read_page_->is_pinned()) << DebugString();
+  DCHECK_GE(read_page_rows_returned_, 0);
+
+  int rows_left_in_page = read_page_->num_rows - read_page_rows_returned_;
+  int rows_to_fill = std::min(batch->capacity() - batch->num_rows(), rows_left_in_page);
+  DCHECK_GE(rows_to_fill, 1);
+  uint8_t* tuple_row_mem = reinterpret_cast<uint8_t*>(batch->get_row(batch->num_rows()));
+
+  // Produce tuple rows from the current page and the corresponding position on the
+  // null tuple indicator.
+  if (FILL_FLAT_ROWS) {
+    DCHECK(flat_rows != nullptr);
+    DCHECK(!delete_on_read_);
+    DCHECK_EQ(batch->num_rows(), 0);
+    flat_rows->clear();
+    flat_rows->reserve(rows_to_fill);
+  }
+
+  const uint64_t tuples_per_row = desc_->tuple_descriptors().size();
+  // Start reading from the current position in 'read_page_'.
+  for (int i = 0; i < rows_to_fill; ++i) {
+    if (FILL_FLAT_ROWS) {
+      flat_rows->push_back(read_ptr_);
+      DCHECK_EQ(flat_rows->size(), i + 1);
+    }
+    // Copy the row into the output batch.
+    TupleRow* output_row = reinterpret_cast<TupleRow*>(tuple_row_mem);
+    tuple_row_mem += sizeof(Tuple*) * tuples_per_row;
+    UnflattenTupleRow<HAS_NULLABLE_TUPLE>(&read_ptr_, output_row);
+
+    // Update string slot ptrs, skipping external strings.
+    for (int j = 0; j < inlined_string_slots_.size(); ++j) {
+      Tuple* tuple = output_row->get_tuple(inlined_string_slots_[j].first);
+      if (HAS_NULLABLE_TUPLE && tuple == nullptr) continue;
+      FixUpStringsForRead(inlined_string_slots_[j].second, tuple);
+    }
+/*
+    // Update collection slot ptrs, skipping external collections. We traverse the
+    // collection structure in the same order as it was written to the stream, allowing
+    // us to infer the data layout based on the length of collections and strings.
+    for (int j = 0; j < inlined_coll_slots_.size(); ++j) {
+      Tuple* tuple = output_row->get_tuple(inlined_coll_slots_[j].first);
+      if (HAS_NULLABLE_TUPLE && tuple == nullptr) continue;
+      FixUpCollectionsForRead(inlined_coll_slots_[j].second, tuple);
+    }
+*/
+  }
+
+  batch->commit_rows(rows_to_fill);
+  rows_returned_ += rows_to_fill;
+  read_page_rows_returned_ += rows_to_fill;
+  *eos = (rows_returned_ == num_rows_);
+  if (read_page_rows_returned_ == read_page_->num_rows && (!pinned_ || delete_on_read_)) {
+    // No more data in this page. The batch must be immediately returned up the operator
+    // tree and deep copied so that NextReadPage() can reuse the read page's buffer.
+    // TODO: IMPALA-4179 - instead attach the buffer and flush the resources.
+    batch->mark_needs_deep_copy();
+  }
+  if (FILL_FLAT_ROWS) DCHECK_EQ(flat_rows->size(), rows_to_fill);
+  DCHECK_LE(read_ptr_, read_end_ptr_);
+  return Status::OK;
+}
+
+// Repoints each non-null inlined string slot of 'tuple' at the string bytes
+// that follow the fixed-length tuple data at the read cursor, advancing the
+// cursor past them. Template args / cast types restored after markup stripping.
+void BufferedTupleStream3::FixUpStringsForRead(
+    const vector<SlotDescriptor*>& string_slots, Tuple* tuple) {
+  DCHECK(tuple != nullptr);
+  for (const SlotDescriptor* slot_desc : string_slots) {
+    if (tuple->is_null(slot_desc->null_indicator_offset())) continue;
+
+    StringValue* sv = tuple->get_string_slot(slot_desc->tuple_offset());
+    DCHECK_LE(read_ptr_ + sv->len, read_end_ptr_);
+    sv->ptr = reinterpret_cast<char*>(read_ptr_);
+    read_ptr_ += sv->len;
+  }
+}
+/*
+void BufferedTupleStream3::FixUpCollectionsForRead(
+    const vector& collection_slots, Tuple* tuple) {
+  DCHECK(tuple != nullptr);
+  for (const SlotDescriptor* slot_desc : collection_slots) {
+    if (tuple->is_null(slot_desc->null_indicator_offset())) continue;
+
+    CollectionValue* cv = tuple->get_collection_slot(slot_desc->tuple_offset());
+    const TupleDescriptor& item_desc = *slot_desc->collection_item_descriptor();
+    int coll_byte_size = cv->num_tuples * item_desc.byte_size();
+    DCHECK_LE(read_ptr_ + coll_byte_size, read_end_ptr_);
+    cv->ptr = reinterpret_cast(read_ptr_);
+    read_ptr_ += coll_byte_size;
+
+    if (!item_desc.has_varlen_slots()) continue;
+    uint8_t* coll_data = cv->ptr;
+    for (int i = 0; i < cv->num_tuples; ++i) {
+      Tuple* item = reinterpret_cast(coll_data);
+      FixUpStringsForRead(item_desc.string_slots(), item);
+      FixUpCollectionsForRead(item_desc.collection_slots(), item);
+      coll_data += item_desc.byte_size();
+    }
+  }
+}
+*/
+// Computes the number of bytes 'row' occupies when serialized into the stream:
+// the null-indicator bytes (nullable case), the fixed-length portion of each
+// non-null tuple, and the lengths of all inlined string slots. Template args
+// restored after markup stripping.
+int64_t BufferedTupleStream3::ComputeRowSize(TupleRow* row) const noexcept {
+  int64_t size = 0;
+  if (has_nullable_tuple_) {
+    size += NullIndicatorBytesPerRow();
+    for (int i = 0; i < fixed_tuple_sizes_.size(); ++i) {
+      if (row->get_tuple(i) != nullptr) size += fixed_tuple_sizes_[i];
+    }
+  } else {
+    for (int i = 0; i < fixed_tuple_sizes_.size(); ++i) {
+      size += fixed_tuple_sizes_[i];
+    }
+  }
+  for (int i = 0; i < inlined_string_slots_.size(); ++i) {
+    Tuple* tuple = row->get_tuple(inlined_string_slots_[i].first);
+    if (tuple == nullptr) continue;
+    const vector<SlotDescriptor*>& slots = inlined_string_slots_[i].second;
+    for (auto it = slots.begin(); it != slots.end(); ++it) {
+      if (tuple->is_null((*it)->null_indicator_offset())) continue;
+      size += tuple->get_string_slot((*it)->tuple_offset())->len;
+    }
+  }
+
+/*
+  for (int i = 0; i < inlined_coll_slots_.size(); ++i) {
+    Tuple* tuple = row->get_tuple(inlined_coll_slots_[i].first);
+    if (tuple == nullptr) continue;
+    const vector<SlotDescriptor*>& slots = inlined_coll_slots_[i].second;
+    for (auto it = slots.begin(); it != slots.end(); ++it) {
+      if (tuple->is_null((*it)->null_indicator_offset())) continue;
+      CollectionValue* cv = tuple->get_collection_slot((*it)->tuple_offset());
+      const TupleDescriptor& item_desc = *(*it)->collection_item_descriptor();
+      size += cv->num_tuples * item_desc.byte_size();
+
+      if (!item_desc.has_varlen_slots()) continue;
+      for (int j = 0; j < cv->num_tuples; ++j) {
+        Tuple* item = reinterpret_cast<Tuple*>(&cv->ptr[j * item_desc.byte_size()]);
+        size += item->varlen_byte_size(item_desc);
+      }
+    }
+  }
+*/
+  return size;
+}
+
+// Slow path for AddRow(): the current write page is missing or full. Computes
+// the row's serialized size, advances to a page with enough space via the
+// custom-append path, then deep-copies the row into it.
+bool BufferedTupleStream3::AddRowSlow(TupleRow* row, Status* status) noexcept {
+  const int64_t serialized_size = ComputeRowSize(row);
+  uint8_t* dst = AddRowCustomBeginSlow(serialized_size, status);
+  if (dst == nullptr) return false;
+  const bool copied = DeepCopy(row, &dst, dst + serialized_size);
+  DCHECK(copied);
+  DCHECK_EQ(dst, write_ptr_);
+  AddRowCustomEnd(serialized_size);
+  return true;
+}
+
+// Slow path for AddRowCustomBegin(): advances to a new write page big enough
+// for 'size' bytes. Returns nullptr on failure, leaving '*status' set to the
+// error (or OK if we simply could not get the reservation).
+uint8_t* BufferedTupleStream3::AddRowCustomBeginSlow(
+    int64_t size, Status* status) noexcept {
+  bool advanced;
+  *status = AdvanceWritePage(size, &advanced);
+  if (!status->ok() || !advanced) return nullptr;
+  // We have a large-enough page so now success is guaranteed.
+  uint8_t* row_mem = AddRowCustomBegin(size, status);
+  DCHECK(row_mem != nullptr);
+  return row_mem;
+}
+
+// Finishes appending a row larger than the default page length: the large
+// write page is released immediately so its oversized reservation is not held
+// and no further rows land on it, and default-page reservation is re-saved if
+// a future write page will need it.
+void BufferedTupleStream3::AddLargeRowCustomEnd(int64_t size) noexcept {
+  DCHECK_GT(size, default_page_len_);
+  // Immediately unpin the large write page so that we're not using up extra reservation
+  // and so we don't append another row to the page.
+  ResetWritePage();
+  // Save some of the reservation we freed up so we can create the next write page when
+  // needed.
+  if (NeedWriteReservation()) {
+    buffer_pool_client_->SaveReservation(&write_page_reservation_, default_page_len_);
+  }
+  // The stream should be in a consistent state once the row is added.
+  CHECK_CONSISTENCY_FAST();
+}
+
+// Appends 'row' to the stream. Fast path: deep-copy into the remaining space
+// of the current write page; falls back to AddRowSlow() when there is no
+// write page or the row does not fit.
+bool BufferedTupleStream3::AddRow(TupleRow* row, Status* status) noexcept {
+  DCHECK(!closed_);
+  DCHECK(has_write_iterator());
+  if (LIKELY(write_page_ != nullptr && DeepCopy(row, &write_ptr_, write_end_ptr_))) {
+    ++num_rows_;
+    ++write_page_->num_rows;
+    return true;
+  }
+  return AddRowSlow(row, status);
+}
+
+// Serializes 'row' into the buffer at '*data' (bounded by 'data_end'),
+// dispatching on tuple nullability. Returns false if the row does not fit.
+// Template arguments restored after markup stripping.
+bool BufferedTupleStream3::DeepCopy(
+    TupleRow* row, uint8_t** data, const uint8_t* data_end) noexcept {
+  return has_nullable_tuple_ ? DeepCopyInternal<true>(row, data, data_end) :
+                               DeepCopyInternal<false>(row, data, data_end);
+}
+
+// TODO: consider codegening this.
+// TODO: in case of duplicate tuples, this can redundantly serialize data.
+// Serializes one row: null-indicator bytes (nullable case), then each
+// non-null tuple's fixed-length data, then inlined string bytes. Returns
+// false (leaving '*data' untouched) if any part would overrun 'data_end'.
+// Template parameter restored after markup stripping.
+template <bool HAS_NULLABLE_TUPLE>
+bool BufferedTupleStream3::DeepCopyInternal(
+    TupleRow* row, uint8_t** data, const uint8_t* data_end) noexcept {
+  uint8_t* pos = *data;
+  const uint64_t tuples_per_row = desc_->tuple_descriptors().size();
+  // Copy the not NULL fixed len tuples. For the NULL tuples just update the NULL tuple
+  // indicator.
+  if (HAS_NULLABLE_TUPLE) {
+    int null_indicator_bytes = NullIndicatorBytesPerRow();
+    if (UNLIKELY(pos + null_indicator_bytes > data_end)) return false;
+    uint8_t* null_indicators = pos;
+    pos += NullIndicatorBytesPerRow();
+    memset(null_indicators, 0, null_indicator_bytes);
+    for (int i = 0; i < tuples_per_row; ++i) {
+      uint8_t* null_word = null_indicators + (i >> 3);
+      const uint32_t null_pos = i & 7;
+      const int tuple_size = fixed_tuple_sizes_[i];
+      Tuple* t = row->get_tuple(i);
+      const uint8_t mask = 1 << (7 - null_pos);
+      if (t != nullptr) {
+        if (UNLIKELY(pos + tuple_size > data_end)) return false;
+        memcpy(pos, t, tuple_size);
+        pos += tuple_size;
+      } else {
+        *null_word |= mask;
+      }
+    }
+  } else {
+    // If we know that there are no nullable tuples no need to set the nullability flags.
+    for (int i = 0; i < tuples_per_row; ++i) {
+      const int tuple_size = fixed_tuple_sizes_[i];
+      if (UNLIKELY(pos + tuple_size > data_end)) return false;
+      Tuple* t = row->get_tuple(i);
+      // TODO: Once IMPALA-1306 (Avoid passing empty tuples of non-materialized slots)
+      // is delivered, the check below should become DCHECK(t != nullptr).
+      DCHECK(t != nullptr || tuple_size == 0);
+      memcpy(pos, t, tuple_size);
+      pos += tuple_size;
+    }
+  }
+
+  // Copy inlined string slots. Note: we do not need to convert the string ptrs to offsets
+  // on the write path, only on the read. The tuple data is immediately followed
+  // by the string data so only the len information is necessary.
+  for (int i = 0; i < inlined_string_slots_.size(); ++i) {
+    const Tuple* tuple = row->get_tuple(inlined_string_slots_[i].first);
+    if (HAS_NULLABLE_TUPLE && tuple == nullptr) continue;
+    if (UNLIKELY(!CopyStrings(tuple, inlined_string_slots_[i].second, &pos, data_end)))
+      return false;
+  }
+/*
+  // Copy inlined collection slots. We copy collection data in a well-defined order so
+  // we do not need to convert pointers to offsets on the write path.
+  for (int i = 0; i < inlined_coll_slots_.size(); ++i) {
+    const Tuple* tuple = row->get_tuple(inlined_coll_slots_[i].first);
+    if (HAS_NULLABLE_TUPLE && tuple == nullptr) continue;
+    if (UNLIKELY(!CopyCollections(tuple, inlined_coll_slots_[i].second, &pos, data_end)))
+      return false;
+  }
+*/
+  *data = pos;
+  return true;
+}
+
+// Appends the bytes of each non-null, non-empty inlined string slot of 'tuple'
+// at '*data', advancing the cursor. Returns false if any string would overrun
+// 'data_end'. Template arguments restored after markup stripping.
+bool BufferedTupleStream3::CopyStrings(const Tuple* tuple,
+    const vector<SlotDescriptor*>& string_slots, uint8_t** data, const uint8_t* data_end) {
+  for (const SlotDescriptor* slot_desc : string_slots) {
+    if (tuple->is_null(slot_desc->null_indicator_offset())) continue;
+    const StringValue* sv = tuple->get_string_slot(slot_desc->tuple_offset());
+    if (LIKELY(sv->len > 0)) {
+      if (UNLIKELY(*data + sv->len > data_end)) return false;
+
+      memcpy(*data, sv->ptr, sv->len);
+      *data += sv->len;
+    }
+  }
+  return true;
+}
+/*
+bool BufferedTupleStream3::CopyCollections(const Tuple* tuple,
+    const vector& collection_slots, uint8_t** data, const uint8_t* data_end) {
+  for (const SlotDescriptor* slot_desc : collection_slots) {
+    if (tuple->is_null(slot_desc->null_indicator_offset())) continue;
+    const CollectionValue* cv = tuple->get_collection_slot(slot_desc->tuple_offset());
+    const TupleDescriptor& item_desc = *slot_desc->collection_item_descriptor();
+    if (LIKELY(cv->num_tuples > 0)) {
+      int coll_byte_size = cv->num_tuples * item_desc.byte_size();
+      if (UNLIKELY(*data + coll_byte_size > data_end)) return false;
+      uint8_t* coll_data = *data;
+      memcpy(coll_data, cv->ptr, coll_byte_size);
+      *data += coll_byte_size;
+
+      if (!item_desc.has_varlen_slots()) continue;
+      // Copy variable length data when present in collection items.
+      for (int i = 0; i < cv->num_tuples; ++i) {
+        const Tuple* item = reinterpret_cast(coll_data);
+        if (UNLIKELY(!CopyStrings(item, item_desc.string_slots(), data, data_end))) {
+          return false;
+        }
+        if (UNLIKELY(
+                !CopyCollections(item, item_desc.collection_slots(), data, data_end))) {
+          return false;
+        }
+        coll_data += item_desc.byte_size();
+      }
+    }
+  }
+  return true;
+}
+*/
+// Materializes the row stored at 'flat_row' (a pointer previously returned via
+// GetNext(..., flat_rows)) into 'row'. Only valid for pinned streams that are
+// not delete-on-read. Template arguments restored after markup stripping.
+void BufferedTupleStream3::GetTupleRow(FlatRowPtr flat_row, TupleRow* row) const {
+  DCHECK(row != nullptr);
+  DCHECK(!closed_);
+  DCHECK(is_pinned());
+  DCHECK(!delete_on_read_);
+  uint8_t* data = flat_row;
+  return has_nullable_tuple_ ? UnflattenTupleRow<true>(&data, row) :
+                               UnflattenTupleRow<false>(&data, row);
+}
+
+// Reconstructs a TupleRow from serialized row data at '*data', pointing each
+// tuple slot into the buffer (or to nullptr per the null-indicator bytes) and
+// advancing '*data' past the fixed-length portion. Template declaration and
+// reinterpret_cast target types restored after markup stripping.
+template <bool HAS_NULLABLE_TUPLE>
+void BufferedTupleStream3::UnflattenTupleRow(uint8_t** data, TupleRow* row) const {
+  const int tuples_per_row = desc_->tuple_descriptors().size();
+  uint8_t* ptr = *data;
+  if (has_nullable_tuple_) {
+    // Stitch together the tuples from the page and the NULL ones.
+    const uint8_t* null_indicators = ptr;
+    ptr += NullIndicatorBytesPerRow();
+    for (int i = 0; i < tuples_per_row; ++i) {
+      const uint8_t* null_word = null_indicators + (i >> 3);
+      const uint32_t null_pos = i & 7;
+      const bool is_not_null = ((*null_word & (1 << (7 - null_pos))) == 0);
+      // Multiply the pointer by 0/1 to yield nullptr for NULL tuples without a branch.
+      row->set_tuple(
+          i, reinterpret_cast<Tuple*>(reinterpret_cast<uint64_t>(ptr) * is_not_null));
+      ptr += fixed_tuple_sizes_[i] * is_not_null;
+    }
+  } else {
+    for (int i = 0; i < tuples_per_row; ++i) {
+      row->set_tuple(i, reinterpret_cast<Tuple*>(ptr));
+      ptr += fixed_tuple_sizes_[i];
+    }
+  }
+  *data = ptr;
+}
diff --git a/be/src/runtime/buffered_tuple_stream3.h b/be/src/runtime/buffered_tuple_stream3.h
new file mode 100644
index 0000000000..d1cc57d4a1
--- /dev/null
+++ b/be/src/runtime/buffered_tuple_stream3.h
@@ -0,0 +1,708 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_RUNTIME_BUFFERED_TUPLE_STREAM_H
+#define BDG_PALO_BE_RUNTIME_BUFFERED_TUPLE_STREAM_H
+
+#include 
+#include 
+#include 
+#include 
+
+#include "common/global_types.h"
+#include "common/status.h"
+#include "gutil/macros.h"
+#include "runtime/bufferpool/buffer_pool.h"
+#include "runtime/row_batch.h"
+
+namespace palo {
+
+class MemTracker;
+class RuntimeState;
+class RowDescriptor;
+class SlotDescriptor;
+class Tuple;
+class TupleRow;
+
+/// Class that provides an abstraction for a stream of tuple rows backed by BufferPool
+/// Pages. Rows can be added to the stream and read back. Rows are returned in the order
+/// they are added.
+///
+/// The BufferedTupleStream3 is *not* thread safe from the caller's point of view.
+/// Different threads should not concurrently call methods of the same BufferedTupleStream3
+/// object.
+///
+/// Reading and writing the stream:
+/// The stream supports two modes of reading/writing, depending on whether
+/// PrepareForWrite() is called to initialize a write iterator only or
+/// PrepareForReadWrite() is called to initialize both read and write iterators to enable
+/// interleaved reads and writes.
+///
+/// To use write-only mode, PrepareForWrite() is called once and AddRow()/AddRowCustom*()
+/// are called repeatedly to initialize then advance a write iterator through the stream.
+/// Once the stream is fully written, it can be read back by calling PrepareForRead()
+/// then GetNext() repeatedly to advance a read iterator through the stream, or by
+/// calling GetRows() to get all of the rows at once.
+///
+/// To use read/write mode, PrepareForReadWrite() is called once to initialize the read
+/// and write iterators. AddRow()/AddRowCustom*() then advance a write iterator through
+/// the stream, and GetNext() advances a trailing read iterator through the stream.
+///
+/// Buffer management:
+/// The tuple stream is backed by a sequence of BufferPool Pages. The tuple stream uses
+/// the client's reservation to pin pages in memory. It will automatically try to
+/// increase the client's reservation whenever it needs to do so to make progress.
+///
+/// Normally pages are all of the same default page length, but larger pages up to the
+/// max page length are used if needed to store rows that are too large for a
+/// default-length page.
+///
+/// The stream has both pinned and unpinned modes. In the pinned mode all pages are
+/// pinned for reading. The pinned mode avoids I/O by keeping all pages pinned in memory
+/// and allows clients to save pointers to rows in the stream and randomly access them.
+/// E.g. hash tables can be backed by a BufferedTupleStream3. In the unpinned mode, only
+/// pages currently being read and written are pinned and other pages are unpinned and
+/// therefore do not use the client's reservation and can be spilled to disk. The stream
+/// always holds onto a default page's worth of reservation for the read and write
+/// iterators (i.e. two page's worth if the stream is in read/write mode), even if that
+/// many pages are not currently pinned. This means that UnpinStream() always succeeds,
+/// and moving to the next default-length write page or read page on an unpinned stream
+/// does not require additional reservation. This is implemented by saving reservations
+/// in SubReservations.
+///
+/// To read or write a row larger than the default page size to/from an unpinned stream,
+/// the client must have max_page_len - default_page_len unused reservation. Writing a
+/// large row to an unpinned stream only uses the reservation for the duration of the
+/// AddRow()/AddRowCustom*() call. Reading a large row from an unpinned stream uses the
+/// reservation until the next call to GetNext(). E.g. to partition a single unpinned
+/// stream into n unpinned streams, the reservation needed is (n - 1) *
+/// default_page_len + 2 * max_page_len: one large read buffer and one large write
+/// buffer is needed to keep the row being processed in-memory, but only default-sized
+/// buffers are needed for the other streams being written.
+///
+/// The tuple stream also supports a 'delete_on_read' mode, enabled by passing a flag
+/// to PrepareForRead() which deletes the stream's pages as it does a final read
+/// pass over the stream.
+///
+/// TODO: IMPALA-4179: the buffer management can be simplified once we can attach
+/// buffers to RowBatches.
+///
+/// Page layout:
+/// Rows are stored back to back starting at the first byte of each page's buffer, with
+/// no interleaving of data from different rows. There is no padding or alignment
+/// between rows. Rows larger than the default page length are stored on their own
+/// page.
+///
+/// Tuple row layout:
+/// If the stream's tuples are nullable (i.e. has_nullable_tuple_ is true), there is a
+/// bitstring at the start of each row with null indicators for all tuples in each row
+/// (including non-nullable tuples). The bitstring occupies ceil(num_tuples_per_row / 8)
+/// bytes. A 1 indicates the tuple is null.
+///
+/// The fixed length parts of the row's tuples are stored first, followed by var len data
+/// for inlined_string_slots_ and inlined_coll_slots_. Other "external" var len slots can
+/// point to var len data outside the stream. When reading the stream, the length of each
+/// row's var len data in the stream must be computed to find the next row's start.
+///
+/// The tuple stream supports reading from the stream into RowBatches without copying
+/// out any data: the RowBatches' Tuple pointers will point directly into the stream's
+/// pages' buffers. The fixed length parts follow Impala's internal tuple format, so for
+/// the tuple to be valid, we only need to update pointers to point to the var len data
+/// in the stream. These pointers need to be updated by the stream because a spilled
+/// page's data may be relocated to a different buffer. The pointers are updated lazily
+/// upon reading the stream via GetNext() or GetRows().
+///
+/// Example layout for a row with two non-nullable tuples ((1, "hello"), (2, "world"))
+/// with all var len data stored in the stream:
+///  <---- tuple 1 -----> <------ tuple 2 ------> <- var len -> <- next row ...
+/// +--------+-----------+-----------+-----------+-------------+
+/// | IntVal | StringVal | BigIntVal | StringVal |             | ...
+/// +--------+-----------+-----------+-----------++------------+
+/// | val: 1 | len: 5    | val: 2    | len: 5    | helloworld  | ...
+/// |        | ptr: 0x.. |           | ptr: 0x.. |             | ...
+/// +--------+-----------+-----------+-----------+-------------+
+///  <--4b--> <---12b---> <----8b---> <---12b---> <----10b---->
+///
+/// Example layout for a row with the second tuple nullable ((1, "hello"), NULL)
+/// with all var len data stored in the stream:
+/// <- null tuple bitstring -> <---- tuple 1 -----> <- var len -> <- next row ...
+/// +-------------------------+--------+-----------+------------+
+/// |                         | IntVal | StringVal |            | ...
+/// +-------------------------+--------+-----------+------------+
+/// | 0000 0010               | val: 1 | len: 5    | hello      | ...
+/// |                         |        | ptr: 0x.. |            | ...
+/// +-------------------------+--------+-----------+------------+
+///  <---------1b------------> <--4b--> <---12b---> <----5b---->
+///
+/// Example layout for a row with a single non-nullable tuple (("hello", "world")) with
+/// the second string slot stored externally to the stream:
+///  <------ tuple 1 ------> <- var len ->  <- next row ...
+/// +-----------+-----------+-------------+
+/// | StringVal | StringVal |             | ...
+/// +-----------+-----------+-------------+
+/// | len: 5    | len: 5    |  hello      | ...
+/// | ptr: 0x.. | ptr: 0x.. |             | ...
+/// +-----------+-----------+-------------+
+///  <---12b---> <---12b---> <-----5b---->
+///
+/// The behavior of reads and writes is as follows:
+/// Read:
+///   1. Unpinned: Only a single read page is pinned at a time. This means that only
+///     enough reservation to pin a single page is needed to read the stream, regardless
+///     of the stream's size. Each page is deleted or unpinned (if delete on read is true
+///     or false respectively) before advancing to the next page.
+///   2. Pinned: All pages in the stream are pinned so do not need to be pinned or
+///     unpinned when reading from the stream. If delete on read is true, pages are
+///     deleted after being read. If the stream was previously unpinned, the page's data
+///     may not yet be in memory - reading from the stream can block on I/O or fail with
+///     an I/O error.
+/// Write:
+///   1. Unpinned: Unpin pages as they fill up. This means that only enough reservation
+///     to pin a single write page is required to write to the stream, regardless of the
+///     stream's size.
+///   2. Pinned: Pages are left pinned. If the next page in the stream cannot be pinned
+///     because the client's reservation is insufficient (and could not be increased by
+///     the stream), the read call will fail and the client can either unpin the stream
+///     or free up other memory before retrying.
+///
+/// Memory lifetime of rows read from stream:
+/// If the stream is pinned and delete on read is false, it is valid to access any tuples
+/// returned via GetNext() or GetRows() until the stream is unpinned. If the stream is
+/// unpinned or delete on read is true, then the batch returned from GetNext() may have
+/// the needs_deep_copy flag set, which means that any tuple memory returned so far from
+/// the stream may be freed on the next call to GetNext().
+/// TODO: IMPALA-4179, instead of needs_deep_copy, attach the pages' buffers to the batch.
+///
+/// Manual construction of rows with AddRowCustomBegin()/AddRowCustomEnd():
+/// The BufferedTupleStream3 supports allocation of uninitialized rows with
+/// AddRowCustom*(). AddRowCustomBegin() is called instead of AddRow() if the client wants
+/// to manually construct a row. The caller of AddRowCustomBegin() is responsible for
+/// writing the row with exactly the layout described above then calling
+/// AddRowCustomEnd() when done.
+///
+/// If a caller constructs a tuple in this way, the caller can set the pointers and they
+/// will not be modified until the stream is read via GetNext() or GetRows().
+/// TODO: IMPALA-5007: try to remove AddRowCustom*() by unifying with AddRow().
+///
+/// TODO: we need to be able to do read ahead for pages. We need some way to indicate a
+/// page will need to be pinned soon.
+class BufferedTupleStream3 {
+ public:
+  /// A pointer to the start of a flattened TupleRow in the stream.
+  typedef uint8_t* FlatRowPtr;
+
+  /// row_desc: description of rows stored in the stream. This is the desc for rows
+  /// that are added and the rows being returned.
+  /// page_len: the size of pages to use in the stream
+  /// ext_varlen_slots: set of varlen slots with data stored externally to the stream
+  BufferedTupleStream3(RuntimeState* state, const RowDescriptor* row_desc,
+      BufferPool::ClientHandle* buffer_pool_client, int64_t default_page_len,
+      int64_t max_page_len,
+      const std::set& ext_varlen_slots = std::set());
+
+  virtual ~BufferedTupleStream3();
+
+  /// Initializes the tuple stream object on behalf of node 'node_id'. Must be called
+  /// once before any of the other APIs.
+  /// If 'pinned' is true, the tuple stream starts off pinned, otherwise it is unpinned.
+  /// 'node_id' is only used for error reporting.
+  Status Init(int node_id, bool pinned) WARN_UNUSED_RESULT;
+
+  /// Prepares the stream for writing by saving enough reservation for a default-size
+  /// write page. Tries to increase reservation if there is not enough unused reservation
+  /// for a page. Called after Init() and before the first AddRow() or
+  /// AddRowCustomBegin() call.
+  /// 'got_reservation': set to true if there was enough reservation to initialize the
+  ///     first write page and false if there was not enough reservation and no other
+  ///     error was encountered. Undefined if an error status is returned.
+  Status PrepareForWrite(bool* got_reservation) WARN_UNUSED_RESULT;
+
+  /// Prepares the stream for interleaved reads and writes by saving enough reservation
+  /// for default-sized read and write pages. Called after Init() and before the first
+  /// AddRow() or AddRowCustomBegin() call.
+  /// 'delete_on_read': Pages are deleted after they are read.
+  /// 'got_reservation': set to true if there was enough reservation to initialize the
+  ///     read and write pages and false if there was not enough reservation and no other
+  ///     error was encountered. Undefined if an error status is returned.
+  Status PrepareForReadWrite(
+      bool delete_on_read, bool* got_reservation) WARN_UNUSED_RESULT;
+
+  /// Prepares the stream for reading, invalidating the write iterator (if there is one).
+  /// Therefore must be called after the last AddRow() or AddRowCustomEnd() and before
+  /// GetNext(). PrepareForRead() can be called multiple times to do multiple read passes
+  /// over the stream, unless rows were read from the stream after PrepareForRead() or
+  /// PrepareForReadWrite() was called with delete_on_read = true.
+  /// 'delete_on_read': Pages are deleted after they are read.
+  /// 'got_reservation': set to true if there was enough reservation to initialize the
+  ///     first read page and false if there was not enough reservation and no other
+  ///     error was encountered. Undefined if an error status is returned.
+  Status PrepareForRead(bool delete_on_read, bool* got_reservation) WARN_UNUSED_RESULT;
+
+  /// Adds a single row to the stream. There are four possible outcomes:
+  /// a) The append succeeds. True is returned.
+  /// b) The append fails because the unused reservation was not sufficient to add
+  ///   a new page to the stream large enough to fit 'row' and the stream could not
+  ///   increase the reservation to get enough unused reservation. Returns false and
+  ///   sets 'status' to OK. The append can be retried after freeing up memory or
+  ///   unpinning the stream.
+  /// c) The append fails with a runtime error. Returns false and sets 'status' to an
+  ///   error.
+  /// d) The append fails because the row is too large to fit in a page of a stream.
+  ///   Returns false and sets 'status' to an error.
+  ///
+  /// Unpinned streams can only encounter case b) when appending a row larger than
+  /// the default page size and the reservation could not be increased sufficiently.
+  /// Otherwise enough memory is automatically freed up by unpinning the current write
+  /// page.
+  ///
+  /// BufferedTupleStream3 will do a deep copy of the memory in the row. After AddRow()
+  /// returns an error, it should not be called again.
+  bool AddRow(TupleRow* row, Status* status) noexcept WARN_UNUSED_RESULT;
+
+  /// Allocates space to store a row of 'size' bytes (including fixed and variable length
+  /// data). If successful, returns a pointer to the allocated row. The caller must then
+  /// write valid data to the row and call AddRowCustomEnd().
+  ///
+  /// If unsuccessful, returns nullptr. The failure modes are the same as described in the
+  /// AddRow() comment.
+  ALWAYS_INLINE uint8_t* AddRowCustomBegin(int64_t size, Status* status);
+
+  /// Called after AddRowCustomBegin() when done writing the row. Only should be called
+  /// if AddRowCustomBegin() succeeded. See the AddRowCustomBegin() comment for
+  /// explanation.
+  /// 'size': the size passed into AddRowCustomBegin().
+  void AddRowCustomEnd(int64_t size);
+
+  /// Unflattens 'flat_row' into a regular TupleRow 'row'. Only valid to call if the
+  /// stream is pinned. The row must have been allocated with the stream's row desc.
+  /// The returned 'row' is backed by memory from the stream so is only valid as long
+  /// as the stream is pinned.
+  void GetTupleRow(FlatRowPtr flat_row, TupleRow* row) const;
+
+  /// Pins all pages in this stream and switches to pinned mode. Has no effect if the
+  /// stream is already pinned.
+  /// If the current unused reservation is not sufficient to pin the stream in memory,
+  /// this will try to increase the reservation. If that fails, 'pinned' is set to false
+  /// and the stream is left unpinned. Otherwise 'pinned' is set to true.
+  Status PinStream(bool* pinned) WARN_UNUSED_RESULT;
+
+  /// Modes for UnpinStream().
+  enum UnpinMode {
+    /// All pages in the stream are unpinned and the read/write positions in the stream
+    /// are reset. No more rows can be written to the stream after this. The stream can
+    /// be re-read from the beginning by calling PrepareForRead().
+    UNPIN_ALL,
+    /// All pages are unpinned aside from the current read and write pages (if any),
+    /// which is left in the same state. The unpinned stream can continue being read
+    /// or written from the current read or write positions.
+    UNPIN_ALL_EXCEPT_CURRENT,
+  };
+
+  /// Unpins stream with the given 'mode' as described above.
+  void UnpinStream(UnpinMode mode);
+
+  /// Get the next batch of output rows, which are backed by the stream's memory.
+  /// If the stream is unpinned or 'delete_on_read' is true, the 'needs_deep_copy'
+  /// flag may be set on 'batch' to signal that memory will be freed on the next
+  /// call to GetNext() and that the caller should copy out any data it needs from
+  /// rows in 'batch' or in previous batches returned from GetNext().
+  ///
+  /// If the stream is pinned and 'delete_on_read' is false, the memory backing the
+  /// rows will remain valid until the stream is unpinned, destroyed, etc.
+  /// TODO: IMPALA-4179: update when we simplify the memory transfer model.
+  Status GetNext(RowBatch* batch, bool* eos) WARN_UNUSED_RESULT;
+
+  /// Same as above, but populate 'flat_rows' with a pointer to the flat version of
+  /// each returned row in the pinned stream. The pointers in 'flat_rows' are only
+  /// valid as long as the stream remains pinned.
+  Status GetNext(
+      RowBatch* batch, bool* eos, std::vector* flat_rows) WARN_UNUSED_RESULT;
+
+  /// Returns all the rows in the stream in batch. This pins the entire stream in the
+  /// process. If the current unused reservation is not sufficient to pin the stream in
+  /// memory, this will try to increase the reservation. If that fails, 'got_rows' is set
+  /// to false.
+  Status GetRows(MemTracker* tracker, boost::scoped_ptr* batch,
+      bool* got_rows) WARN_UNUSED_RESULT;
+
+  /// Must be called once at the end to cleanup all resources. If 'batch' is non-NULL,
+  /// attaches buffers from pinned pages that rows returned from GetNext() may reference.
+  /// Otherwise deletes all pages. Does nothing if the stream was already closed. The
+  /// 'flush' mode is forwarded to RowBatch::AddBuffer() when attaching buffers.
+  void Close(RowBatch* batch, RowBatch::FlushMode flush);
+
+  /// Number of rows in the stream.
+  int64_t num_rows() const { return num_rows_; }
+
+  /// Number of rows returned via GetNext().
+  int64_t rows_returned() const { return rows_returned_; }
+
+  /// Returns the byte size necessary to store the entire stream in memory.
+  int64_t byte_size() const { return total_byte_size_; }
+
+  /// Returns the number of bytes currently pinned in memory by the stream.
+  /// If ignore_current is true, the write_page_ memory is not included.
+  int64_t BytesPinned(bool ignore_current) const {
+    if (ignore_current && write_page_ != nullptr && write_page_->is_pinned()) {
+      return bytes_pinned_ - write_page_->len();
+    }
+    return bytes_pinned_;
+  }
+
+  bool is_closed() const { return closed_; }
+  bool is_pinned() const { return pinned_; }
+  bool has_read_iterator() const { return has_read_iterator_; }
+  bool has_write_iterator() const { return has_write_iterator_; }
+
+  std::string DebugString() const;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(BufferedTupleStream3);
+  friend class ArrayTupleStreamTest_TestArrayDeepCopy_Test;
+  friend class ArrayTupleStreamTest_TestComputeRowSize_Test;
+  friend class MultiNullableTupleStreamTest_TestComputeRowSize_Test;
+  friend class SimpleTupleStreamTest_TestGetRowsOverflow_Test;
+
+  /// Wrapper around BufferPool::PageHandle that tracks additional info about the page.
+  struct Page {
+    Page() : num_rows(0), retrieved_buffer(true) {}
+
+    inline int len() const { return handle.len(); }
+    inline bool is_pinned() const { return handle.is_pinned(); }
+    inline int pin_count() const { return handle.pin_count(); }
+    Status GetBuffer(const BufferPool::BufferHandle** buffer) {
+      RETURN_IF_ERROR(handle.GetBuffer(buffer));
+      retrieved_buffer = true;
+      return Status::OK;
+    }
+    std::string DebugString() const;
+
+    BufferPool::PageHandle handle;
+
+    /// Number of rows written to the page.
+    int num_rows;
+
+    /// Whether we called GetBuffer() on the page since it was last pinned. This means
+    /// that GetBuffer() and ExtractBuffer() cannot fail and that GetNext() may have
+    /// returned rows referencing the page's buffer.
+    bool retrieved_buffer;
+  };
+
+  /// Runtime state instance used to check for cancellation. Not owned.
+  RuntimeState* const state_;
+
+  /// Description of rows stored in the stream.
+  const RowDescriptor* desc_;
+
+  /// Plan node ID, used for error reporting.
+  int node_id_;
+
+  /// The size of the fixed length portion for each tuple in the row.
+  std::vector fixed_tuple_sizes_;
+
+  /// Vectors of all the strings slots that have their varlen data stored in stream
+  /// grouped by tuple_idx.
+  std::vector>> inlined_string_slots_;
+
+  /// Vectors of all the collection slots that have their varlen data stored in the
+  /// stream, grouped by tuple_idx.
+  // std::vector>> inlined_coll_slots_;
+
+  /// Buffer pool and client used to allocate, pin and release pages. Not owned.
+  BufferPool* buffer_pool_;
+  BufferPool::ClientHandle* buffer_pool_client_;
+
+  /// List of pages in the stream.
+  /// Empty iff one of two cases applies:
+  /// * before the first row has been added with AddRow() or AddRowCustom().
+  /// * after the stream has been destructively read in 'delete_on_read' mode
+  std::list pages_;
+  // IMPALA-5629: avoid O(n) list.size() call by explicitly tracking the number of pages.
+  // TODO: remove when we switch to GCC5+, where list.size() is O(1). See GCC bug #49561.
+  int64_t num_pages_;
+
+  /// Total size of pages_, including any pages already deleted in 'delete_on_read'
+  /// mode.
+  int64_t total_byte_size_;
+
+  /// True if there is currently an active read iterator for the stream.
+  bool has_read_iterator_;
+
+  /// The current page being read. When no read iterator is active, equal to list.end().
+  /// When a read iterator is active, either points to the current read page, or equals
+  /// list.end() if no rows have yet been read.  GetNext() does not advance this past
+  /// the end of the stream, so upon eos 'read_page_' points to the last page and
+  /// rows_returned_ == num_rows_. Always pinned, unless a Pin() call failed and an error
+  /// status was returned.
+  std::list::iterator read_page_;
+
+  /// Saved reservation for read iterator. 'default_page_len_' reservation is saved if
+  /// there is a read iterator, no pinned read page, and the possibility that the read
+  /// iterator will advance to a valid page.
+  BufferPool::SubReservation read_page_reservation_;
+
+  /// Number of rows returned from the current read_page_.
+  uint32_t read_page_rows_returned_;
+
+  /// Pointer into read_page_ to the byte after the last row read.
+  uint8_t* read_ptr_;
+
+  /// Pointer to one byte past the end of read_page_. Used to detect overruns.
+  const uint8_t* read_end_ptr_;
+
+  /// Pointer into write_page_ to the byte after the last row written.
+  uint8_t* write_ptr_;
+
+  /// Pointer to one byte past the end of write_page_. Cached to speed up computation
+  const uint8_t* write_end_ptr_;
+
+  /// Number of rows returned to the caller from GetNext() since the last
+  /// PrepareForRead() call.
+  int64_t rows_returned_;
+
+  /// True if there is currently an active write iterator into the stream.
+  bool has_write_iterator_;
+
+  /// The current page for writing. NULL if there is no write iterator or no current
+  /// write page. Always pinned. Size is 'default_page_len_', except temporarily while
+  /// appending a larger row between AddRowCustomBegin() and AddRowCustomEnd().
+  Page* write_page_;
+
+  /// Saved reservation for write iterator. 'default_page_len_' reservation is saved if
+  /// there is a write iterator, no page currently pinned for writing and the possibility
+  /// that a pin count will be needed for the write iterator in future. Specifically if:
+  /// * no rows have been appended to the stream and 'pages_' is empty, or
+  /// * the stream is unpinned, 'write_page_' is null and the last page in 'pages_'
+  ///   is a large page that we advanced past, or
+  /// * there is only one pinned page in the stream and it is already pinned for reading.
+  BufferPool::SubReservation write_page_reservation_;
+
+  /// Total bytes of pinned pages in pages_, stored to avoid iterating over the list
+  /// to compute it.
+  int64_t bytes_pinned_;
+
+  /// Number of rows stored in the stream. Includes rows that were already deleted during
+  /// a destructive 'delete_on_read' pass over the stream.
+  int64_t num_rows_;
+
+  /// The default length in bytes of pages used to store the stream's rows. All rows that
+  /// fit in a default-sized page are stored in default-sized page.
+  const int64_t default_page_len_;
+
+  /// The maximum length in bytes of pages used to store the stream's rows. This is a
+  /// hard limit on the maximum size of row that can be stored in the stream and the
+  /// amount of reservation required to read or write to an unpinned stream.
+  const int64_t max_page_len_;
+
+  /// Whether any tuple in the rows is nullable.
+  const bool has_nullable_tuple_;
+
+  /// If true, pages are deleted after they are read during this read pass. Once rows
+  /// have been read from a stream with 'delete_on_read_' true, this is always true.
+  bool delete_on_read_;
+
+  bool closed_; // Used for debugging.
+
+  /// If true, this stream has been explicitly pinned by the caller and all pages are
+  /// kept pinned until the caller calls UnpinStream().
+  bool pinned_;
+
+  bool is_read_page(const Page* page) const {
+    return read_page_ != pages_.end() && &*read_page_ == page;
+  }
+
+  bool is_write_page(const Page* page) const { return write_page_ == page; }
+
+  /// Return true if the read and write page are the same.
+  bool has_read_write_page() const {
+    return write_page_ != nullptr && is_read_page(write_page_);
+  }
+
+  /// The slow path for AddRow() that is called if there is not sufficient space in
+  /// the current page.
+  bool AddRowSlow(TupleRow* row, Status* status) noexcept;
+
+  /// The slow path for AddRowCustomBegin() that is called if there is not sufficient space in
+  /// the current page.
+  uint8_t* AddRowCustomBeginSlow(int64_t size, Status* status) noexcept;
+
+  /// The slow path for AddRowCustomEnd() that is called for large pages.
+  void AddLargeRowCustomEnd(int64_t size) noexcept;
+
+  /// Copies 'row' into the buffer starting at *data and ending at the byte before
+  /// 'data_end'. On success, returns true and updates *data to point after the last
+  /// byte written. Returns false if there is not enough space in the buffer provided.
+  bool DeepCopy(TupleRow* row, uint8_t** data, const uint8_t* data_end) noexcept;
+
+  /// Templated implementation of DeepCopy().
+  template 
+  bool DeepCopyInternal(TupleRow* row, uint8_t** data, const uint8_t* data_end) noexcept;
+
+  /// Helper function to copy strings in string_slots from tuple into *data.
+  /// Updates *data to the end of the string data added. Returns false if the data
+  /// does not fit in the buffer [*data, data_end).
+  static bool CopyStrings(const Tuple* tuple,
+      const std::vector& string_slots, uint8_t** data,
+      const uint8_t* data_end);
+
+  /// Helper function to deep copy collections in collection_slots from tuple into
+  /// the buffer [*data, data_end). Updates *data to the end of the collection data
+  /// added. Returns false if the data does not fit in the buffer.
+  //static bool CopyCollections(const Tuple* tuple,
+  //    const std::vector& collection_slots, uint8_t** data,
+   //   const uint8_t* data_end);
+
+  /// Gets a new page of 'page_len' bytes from buffer_pool_, updating write_page_,
+  /// write_ptr_ and write_end_ptr_. The caller must ensure there is 'page_len' unused
+  /// reservation. The caller must reset the write page (if there is one) before calling.
+  Status NewWritePage(int64_t page_len) noexcept WARN_UNUSED_RESULT;
+
+  /// Determines what page size is needed to fit a row of 'row_size' bytes.
+  /// Returns an error if the row cannot fit in a page.
+  Status CalcPageLenForRow(int64_t row_size, int64_t* page_len);
+
+  /// Wrapper around NewWritePage() that allocates a new write page that fits a row of
+  /// 'row_size' bytes. Increases reservation if needed to allocate the next page.
+  /// Returns OK and sets 'got_reservation' to true if the write page was successfully
+  /// allocated. Returns an error if the row cannot fit in a page. Returns OK and sets
+  /// 'got_reservation' to false if the reservation could not be increased and no other
+  /// error was encountered.
+  Status AdvanceWritePage(
+      int64_t row_size, bool* got_reservation) noexcept WARN_UNUSED_RESULT;
+
+  /// Reset the write page, if there is one, and unpin pages accordingly. If there
+  /// is an active write iterator, the next row will be appended to a new page.
+  void ResetWritePage();
+
+  /// Invalidate the write iterator and release any resources associated with it. After
+  /// calling this, no more rows can be appended to the stream.
+  void InvalidateWriteIterator();
+
+  /// Same as PrepareForRead(), except the iterators are not invalidated and
+  /// the caller is assumed to have checked there is sufficient unused reservation.
+  Status PrepareForReadInternal(bool delete_on_read) WARN_UNUSED_RESULT;
+
+  /// Pins the next read page. This blocks reading from disk if necessary to bring the
+  /// page's data into memory. Updates read_page_, read_ptr_, and
+  /// read_page_rows_returned_.
+  Status NextReadPage() WARN_UNUSED_RESULT;
+
+  /// Invalidate the read iterator, and release any resources associated with the active
+  /// iterator.
+  void InvalidateReadIterator();
+
+  /// Returns the total additional bytes that this row will consume in write_page_ if
+  /// appended to the page. This includes the row's null indicators, the fixed length
+  /// part of the row and the data for inlined_string_slots_ and inlined_coll_slots_.
+  int64_t ComputeRowSize(TupleRow* row) const noexcept;
+
+  /// Pins page and updates tracking stats.
+  Status PinPage(Page* page) WARN_UNUSED_RESULT;
+
+  /// Increment the page's pin count if this page needs a higher pin count given the
+  /// current read and write iterator positions and whether the stream will be pinned
+  /// ('stream_pinned'). Assumes that no scenarios occur when the pin count needs to
+  /// be incremented multiple times. The caller is responsible for ensuring sufficient
+  /// reservation is available.
+  Status PinPageIfNeeded(Page* page, bool stream_pinned) WARN_UNUSED_RESULT;
+
+  /// Decrement the page's pin count if this page needs a lower pin count given the
+  /// current read and write iterator positions and whether the stream will be pinned
+  /// ('stream_pinned'). Assumes that no scenarios occur when the pin count needs to
+  /// be decremented multiple times.
+  void UnpinPageIfNeeded(Page* page, bool stream_pinned);
+
+  /// Return the expected pin count for 'page' in the current stream based on the current
+  /// read and write pages and whether the stream is pinned.
+  int ExpectedPinCount(bool stream_pinned, const Page* page) const;
+
+  /// Return true if the stream in its current state needs to have a reservation for
+  /// a write page stored in 'write_page_reservation_'.
+  bool NeedWriteReservation() const;
+
+  /// Same as above, except assume the stream's 'pinned_' state is 'stream_pinned'.
+  bool NeedWriteReservation(bool stream_pinned) const;
+
+  /// Same as above, except assume the stream has 'num_pages' pages and different
+  /// iterator state.
+  static bool NeedWriteReservation(bool stream_pinned, int64_t num_pages,
+      bool has_write_iterator, bool has_write_page, bool has_read_write_page);
+
+  /// Return true if the stream in its current state needs to have a reservation for
+  /// a read page stored in 'read_page_reservation_'.
+  bool NeedReadReservation() const;
+
+  /// Same as above, except assume the stream's 'pinned_' state is 'stream_pinned'.
+  bool NeedReadReservation(bool stream_pinned) const;
+
+  /// Same as above, except assume the stream has 'num_pages' pages and a different
+  /// read iterator state.
+  bool NeedReadReservation(bool stream_pinned, int64_t num_pages, bool has_read_iterator,
+      bool has_read_page) const;
+
+  /// Same as above, except assume the stream has 'num_pages' pages and a different
+  /// write iterator state.
+  static bool NeedReadReservation(bool stream_pinned, int64_t num_pages,
+      bool has_read_iterator, bool has_read_page, bool has_write_iterator,
+      bool has_write_page);
+
+  /// Templated GetNext implementations.
+  template 
+  Status GetNextInternal(RowBatch* batch, bool* eos, std::vector* flat_rows);
+  template 
+  Status GetNextInternal(RowBatch* batch, bool* eos, std::vector* flat_rows);
+
+  /// Helper function to convert a flattened TupleRow stored starting at '*data' into
+  /// 'row'. *data is updated to point to the first byte past the end of the row.
+  template 
+  void UnflattenTupleRow(uint8_t** data, TupleRow* row) const;
+
+  /// Helper function for GetNextInternal(). For each string slot in string_slots,
+  /// update StringValue's ptr field to point to the corresponding string data stored
+  /// inline in the stream (at the current value of read_ptr_) advance read_ptr_ by the
+  /// StringValue's length field.
+  void FixUpStringsForRead(const std::vector& string_slots, Tuple* tuple);
+
+  /// Helper function for GetNextInternal(). For each collection slot in collection_slots,
+  /// recursively update any pointers in the CollectionValue to point to the corresponding
+  /// var len data stored inline in the stream, advancing read_ptr_ as data is read.
+  /// Assumes that the collection was serialized to the stream in DeepCopy()'s format.
+  //void FixUpCollectionsForRead(
+  //    const std::vector& collection_slots, Tuple* tuple);
+
+  /// Returns the number of null indicator bytes per row. Only valid if this stream has
+  /// nullable tuples.
+  int NullIndicatorBytesPerRow() const;
+
+  /// Returns the total bytes pinned. Only called in DCHECKs to validate bytes_pinned_.
+  int64_t CalcBytesPinned() const;
+
+  /// DCHECKs if the stream is internally inconsistent. The stream should always be in
+  /// a consistent state after returning success from a public API call. The Fast version
+  /// has constant runtime and does not check all of 'pages_'. The Full version includes
+  /// O(n) checks that require iterating over the whole 'pages_' list (e.g. checking that
+  /// each page is in a valid state).
+  void CheckConsistencyFast() const;
+  void CheckConsistencyFull() const;
+  void CheckPageConsistency(const Page* page) const;
+};
+}
+
+#endif
diff --git a/be/src/runtime/buffered_tuple_stream3.inline.h b/be/src/runtime/buffered_tuple_stream3.inline.h
new file mode 100644
index 0000000000..c877052982
--- /dev/null
+++ b/be/src/runtime/buffered_tuple_stream3.inline.h
@@ -0,0 +1,59 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_RUNTIME_BUFFERED_TUPLE_STREAM_INLINE_H
+#define BDG_PALO_BE_RUNTIME_BUFFERED_TUPLE_STREAM_INLINE_H
+
+#include "runtime/buffered_tuple_stream3.h"
+
+#include "runtime/descriptors.h"
+#include "runtime/tuple_row.h"
+#include "util/bit_util.h"
+
+namespace palo {
+
+inline int BufferedTupleStream3::NullIndicatorBytesPerRow() const {
+  DCHECK(has_nullable_tuple_);
+  return BitUtil::RoundUpNumBytes(fixed_tuple_sizes_.size());
+}
+
+inline uint8_t* BufferedTupleStream3::AddRowCustomBegin(int64_t size, Status* status) {
+  DCHECK(!closed_);
+  DCHECK(has_write_iterator());
+  if (UNLIKELY(write_page_ == nullptr || write_ptr_ + size > write_end_ptr_)) {
+    return AddRowCustomBeginSlow(size, status);
+  }
+  DCHECK(write_page_ != nullptr);
+  DCHECK(write_page_->is_pinned());
+  DCHECK_LE(write_ptr_ + size, write_end_ptr_);
+  ++num_rows_;
+  ++write_page_->num_rows;
+
+  uint8_t* data = write_ptr_;
+  write_ptr_ += size;
+  return data;
+}
+
+inline void BufferedTupleStream3::AddRowCustomEnd(int64_t size) {
+  if (UNLIKELY(size > default_page_len_)) AddLargeRowCustomEnd(size);
+}
+}
+
+#endif
diff --git a/be/src/runtime/bufferpool/buffer_allocator.cc b/be/src/runtime/bufferpool/buffer_allocator.cc
new file mode 100644
index 0000000000..be7f56235a
--- /dev/null
+++ b/be/src/runtime/bufferpool/buffer_allocator.cc
@@ -0,0 +1,738 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "runtime/bufferpool/buffer_allocator.h"
+
+#include 
+
+#include 
+
+#include "common/atomic.h"
+#include "runtime/bufferpool/system_allocator.h"
+#include "util/bit_util.h"
+#include "util/cpu_info.h"
+#include "util/pretty_printer.h"
+#include "util/runtime_profile.h"
+
+#include "common/names.h"
+#include "common/config.h"
+
+//DECLARE_bool(disable_mem_pools);
+
+namespace palo {
+
+/// Decrease 'bytes_remaining' by up to 'max_decrease', down to a minimum of 0.
+/// If 'require_full_decrease' is true, only decrease if we can decrease it
+/// 'max_decrease'. Returns the amount it was decreased by.
+static int64_t DecreaseBytesRemaining(
+    int64_t max_decrease, bool require_full_decrease, AtomicInt64* bytes_remaining);
+
+/// An arena containing free buffers and clean pages that are associated with a
+/// particular core. All public methods are thread-safe.
+class BufferPool::FreeBufferArena : public CacheLineAligned {
+ public:
+  FreeBufferArena(BufferAllocator* parent);
+
+  // Destructor should only run in backend tests.
+  ~FreeBufferArena();
+
+  /// Add a free buffer to the free lists. May free buffers to the system allocator
+  /// if the list becomes full. Caller should not hold 'lock_'
+  void AddFreeBuffer(BufferHandle&& buffer);
+
+  /// Try to get a free buffer of 'buffer_len' bytes from this arena. Returns true and
+  /// sets 'buffer' if found or false if not found. Caller should not hold 'lock_'.
+  bool PopFreeBuffer(int64_t buffer_len, BufferHandle* buffer);
+
+/*
+  /// Try to get a buffer of 'buffer_len' bytes from this arena by evicting a clean page.
+  /// Returns true and sets 'buffer' if a clean page was evicted or false otherwise.
+  /// Caller should not hold 'lock_'
+  bool EvictCleanPage(int64_t buffer_len, BufferHandle* buffer);
+*/
+  /// Try to free 'target_bytes' of memory from this arena back to the system allocator.
+  /// Up to 'target_bytes_to_claim' will be given back to the caller, so it can allocate
+  /// a buffer of that size from the system. Any bytes freed in excess of
+  /// 'target_bytes_to_claim' are added to 'system_bytes_remaining_'. Returns the actual
+  /// number of bytes freed and the actual number of bytes claimed.
+  ///
+  /// Caller should not hold 'lock_'. If 'arena_lock' is non-null, ownership of the
+  /// arena lock is transferred to the caller. Uses std::unique_lock instead of
+  /// boost::unique_lock because it is movable.
+  std::pair FreeSystemMemory(int64_t target_bytes_to_free,
+      int64_t target_bytes_to_claim, std::unique_lock* arena_lock);
+
+  /// Add a clean page to the arena. Caller must hold the page's client's lock and not
+  /// hold 'lock_' or any Page::lock_.
+  void AddCleanPage(Page* page);
+
+  /// Removes the clean page from the arena if present. Returns true if removed. If
+  /// 'claim_buffer' is true, the buffer is returned with the page, otherwise it is
+  /// added to the free buffer list. Caller must hold the page's client's lock and
+  /// not hold 'lock_' or any Page::lock_.
+  bool RemoveCleanPage(bool claim_buffer, Page* page);
+
+  /// Called periodically. Shrinks free lists that are holding onto more memory than
+  /// needed.
+  void Maintenance();
+
+  /// Test helper: gets the current size of the free list for buffers of 'len' bytes
+  /// on core 'core'.
+  int GetFreeListSize(int64_t len);
+
+  /// Return the total number of free buffers in the arena. May be approximate since
+  /// it doesn't acquire the arena lock.
+  int64_t GetNumFreeBuffers();
+
+  /// Return the total bytes of free buffers in the arena. May be approximate since
+  /// it doesn't acquire the arena lock.
+  int64_t GetFreeBufferBytes();
+
+  /// Return the total number of clean pages in the arena. May be approximate since
+  /// it doesn't acquire the arena lock.
+  int64_t GetNumCleanPages();
+
+  string DebugString();
+
+ private:
+  /// The data structures for each power-of-two size of buffers/pages.
+  /// All members are protected by FreeBufferArena::lock_ unless otherwise mentioned.
+  struct PerSizeLists {
+    PerSizeLists() : num_free_buffers(0), low_water_mark(0), num_clean_pages(0) {}
+
+    /// Helper to add a free buffer and increment the counter.
+    /// FreeBufferArena::lock_ must be held by the caller.
+    void AddFreeBuffer(BufferHandle&& buffer) {
+      DCHECK_EQ(num_free_buffers.load(), free_buffers.Size());
+      num_free_buffers.add(1);
+      free_buffers.AddFreeBuffer(move(buffer));
+    }
+
+    /// The number of entries in 'free_buffers'. Can be read without holding a lock to
+    /// allow threads to quickly skip over empty lists when trying to find a buffer.
+    AtomicInt64 num_free_buffers;
+
+    /// Buffers that are not in use that were originally allocated on the core
+    /// corresponding to this arena.
+    FreeList free_buffers;
+
+    /// The minimum size of 'free_buffers' since the last Maintenance() call.
+    int low_water_mark;
+
+    /// The number of entries in 'clean_pages'.
+    /// Can be read without holding a lock to allow threads to quickly skip over empty
+    /// lists when trying to find a buffer in a different arena.
+    AtomicInt64 num_clean_pages;
+
+    /// Unpinned pages that have had their contents written to disk. These pages can be
+    /// evicted to reclaim a buffer for any client. Pages are evicted in FIFO order,
+    /// so that pages are evicted in approximately the same order that the clients wrote
+    /// them to disk. Protected by FreeBufferArena::lock_.
+    InternalList clean_pages;
+  };
+
+  /// Return the number of buffer sizes for this allocator.
+  int NumBufferSizes() const {
+    return parent_->log_max_buffer_len_ - parent_->log_min_buffer_len_ + 1;
+  }
+
+  /// Return the lists of buffers for buffers of the given length.
+  PerSizeLists* GetListsForSize(int64_t buffer_len) {
+    DCHECK(BitUtil::IsPowerOf2(buffer_len));
+    int idx = BitUtil::Log2Ceiling64(buffer_len) - parent_->log_min_buffer_len_;
+    DCHECK_LT(idx, NumBufferSizes());
+    return &buffer_sizes_[idx];
+  }
+
+  /// Compute a sum over all the lists in the arena. Does not lock the arena.
+  int64_t SumOverSizes(
+      std::function compute_fn);
+
+  BufferAllocator* const parent_;
+
+  /// Protects all data structures in the arena. See buffer-pool-internal.h for lock
+  /// order.
+  SpinLock lock_;
+
+  /// Free buffers and clean pages for each buffer size for this arena.
+  /// Indexed by log2(bytes) - log2(min_buffer_len_).
+  PerSizeLists buffer_sizes_[LOG_MAX_BUFFER_BYTES + 1];
+};
+
+int64_t BufferPool::BufferAllocator::CalcMaxBufferLen(
+    int64_t min_buffer_len, int64_t system_bytes_limit) {
+  // Find largest power of 2 smaller than 'system_bytes_limit'.
+  int64_t upper_bound = system_bytes_limit == 0 ? 1L : 1L
+          << BitUtil::Log2Floor64(system_bytes_limit);
+  upper_bound = min(MAX_BUFFER_BYTES, upper_bound);
+  return max(min_buffer_len, upper_bound); // Can't be < min_buffer_len.
+}
+
+BufferPool::BufferAllocator::BufferAllocator(
+    BufferPool* pool, int64_t min_buffer_len, int64_t system_bytes_limit,
+    int64_t clean_page_bytes_limit)
+  : pool_(pool),
+    system_allocator_(new SystemAllocator(min_buffer_len)),
+    min_buffer_len_(min_buffer_len),
+    max_buffer_len_(CalcMaxBufferLen(min_buffer_len, system_bytes_limit)),
+    log_min_buffer_len_(BitUtil::Log2Ceiling64(min_buffer_len_)),
+    log_max_buffer_len_(BitUtil::Log2Ceiling64(max_buffer_len_)),
+    system_bytes_limit_(system_bytes_limit),
+    system_bytes_remaining_(system_bytes_limit),
+    clean_page_bytes_limit_(clean_page_bytes_limit),
+    clean_page_bytes_remaining_(clean_page_bytes_limit),
+    per_core_arenas_(CpuInfo::get_max_num_cores()),
+    max_scavenge_attempts_(MAX_SCAVENGE_ATTEMPTS) {
+  DCHECK(BitUtil::IsPowerOf2(min_buffer_len_)) << min_buffer_len_;
+  DCHECK(BitUtil::IsPowerOf2(max_buffer_len_)) << max_buffer_len_;
+  DCHECK_LE(0, min_buffer_len_);
+  DCHECK_LE(min_buffer_len_, max_buffer_len_);
+  DCHECK_LE(max_buffer_len_, MAX_BUFFER_BYTES);
+  DCHECK_LE(max_buffer_len_, max(system_bytes_limit_, min_buffer_len_));
+
+  for (unique_ptr& arena : per_core_arenas_) {
+    arena.reset(new FreeBufferArena(this));
+  }
+}
+
+BufferPool::BufferAllocator::~BufferAllocator() {
+  per_core_arenas_.clear(); // Release all the memory.
+  // Check for accounting leaks.
+  DCHECK_EQ(system_bytes_limit_, system_bytes_remaining_.load());
+  DCHECK_EQ(clean_page_bytes_limit_, clean_page_bytes_remaining_.load());
+}
+
+Status BufferPool::BufferAllocator::Allocate(
+    ClientHandle* client, int64_t len, BufferHandle* buffer) {
+  SCOPED_TIMER(client->impl_->counters().alloc_time);
+  COUNTER_UPDATE(client->impl_->counters().cumulative_bytes_alloced, len);
+  COUNTER_UPDATE(client->impl_->counters().cumulative_allocations, 1);
+
+  RETURN_IF_ERROR(AllocateInternal(len, buffer));
+  DCHECK(buffer->is_open());
+  buffer->client_ = client;
+  return Status::OK;
+}
+
+Status BufferPool::BufferAllocator::AllocateInternal(int64_t len, BufferHandle* buffer) {
+  DCHECK(!buffer->is_open());
+  DCHECK_GE(len, min_buffer_len_);
+  DCHECK(BitUtil::IsPowerOf2(len)) << len;
+
+  std::stringstream err_stream;
+  if (UNLIKELY(len > MAX_BUFFER_BYTES)) {
+    err_stream << "Tried to allocate buffer of " << len << " bytes"
+               << " max of " << MAX_BUFFER_BYTES << " bytes";
+    return Status(err_stream.str());
+  }
+  if (UNLIKELY(len > system_bytes_limit_)) {
+    err_stream << "Tried to allocate buffer of " << len << " bytes"
+               << " > buffer pool limit of  " << MAX_BUFFER_BYTES << " bytes";
+    return Status(err_stream.str());
+  }
+
+  const int current_core = CpuInfo::get_current_core();
+  // Fast path: recycle a buffer of the correct size from this core's arena.
+  FreeBufferArena* current_core_arena = per_core_arenas_[current_core].get();
+  if (current_core_arena->PopFreeBuffer(len, buffer)) return Status::OK;
+
+  // Fast-ish path: allocate a new buffer if there is room in 'system_bytes_remaining_'.
+  int64_t delta = DecreaseBytesRemaining(len, true, &system_bytes_remaining_);
+  if (delta != len) {
+    DCHECK_EQ(0, delta);
+    const vector& numa_node_cores = CpuInfo::get_cores_of_same_numa_node(current_core);
+    const int numa_node_core_idx = CpuInfo::get_numa_node_core_idx(current_core);
+
+    // Fast-ish path: find a buffer of the right size from another core on the same
+    // NUMA node. Avoid getting a buffer from another NUMA node - prefer reclaiming
+    // a clean page on this NUMA node or scavenging then reallocating a new buffer.
+    // We don't want to get into a state where allocations between the nodes are
+    // unbalanced and one node is stuck reusing memory allocated on the other node.
+    for (int i = 1; i < numa_node_cores.size(); ++i) {
+      // Each core should start searching from a different point to avoid hot-spots.
+      int other_core = numa_node_cores[(numa_node_core_idx + i) % numa_node_cores.size()];
+      FreeBufferArena* other_core_arena = per_core_arenas_[other_core].get();
+      if (other_core_arena->PopFreeBuffer(len, buffer)) return Status::OK;
+    }
+
+/*
+    // Fast-ish path: evict a clean page of the right size from the current NUMA node.
+    for (int i = 0; i < numa_node_cores.size(); ++i) {
+      int other_core = numa_node_cores[(numa_node_core_idx + i) % numa_node_cores.size()];
+      FreeBufferArena* other_core_arena = per_core_arenas_[other_core].get();
+      if (other_core_arena->EvictCleanPage(len, buffer)) return Status::OK;
+    }
+*/
+    // Slow path: scavenge buffers of different sizes from free buffer lists and clean
+    // pages. Make initial, fast attempts to gather the required buffers, before
+    // finally making a slower, but guaranteed-to-succeed attempt.
+    // TODO: IMPALA-4703: add a stress option where we vary the number of attempts
+    // randomly.
+    int attempt = 0;
+    while (attempt < max_scavenge_attempts_ && delta < len) {
+      bool final_attempt = attempt == max_scavenge_attempts_ - 1;
+      delta += ScavengeBuffers(final_attempt, current_core, len - delta);
+      ++attempt;
+    }
+    if (delta < len) {
+      system_bytes_remaining_.add(delta);
+      // This indicates an accounting bug - we should be able to always get the memory.
+      std::stringstream err_stream;
+      err_stream << "Could not allocate : " << len 
+                 <<  "bytes: was only able to free up "
+                 << delta << " bytes after " << max_scavenge_attempts_
+                 << " attempts:\n" << pool_->DebugString();
+      return Status(err_stream.str());
+    }
+  }
+  // We have headroom to allocate a new buffer at this point.
+  DCHECK_EQ(delta, len);
+  Status status = system_allocator_->Allocate(len, buffer);
+  if (!status.ok()) {
+    system_bytes_remaining_.add(len);
+    return status;
+  }
+  return Status::OK;
+}
+
+int64_t DecreaseBytesRemaining(
+    int64_t max_decrease, bool require_full_decrease, AtomicInt64* bytes_remaining) {
+  while (true) {
+    int64_t old_value = bytes_remaining->load();
+    if (require_full_decrease && old_value < max_decrease) return 0;
+    int64_t decrease = min(old_value, max_decrease);
+    int64_t new_value = old_value - decrease;
+    if (bytes_remaining->compare_and_swap(old_value, new_value)) {
+      return decrease;
+    }
+  }
+}
+
+int64_t BufferPool::BufferAllocator::ScavengeBuffers(
+    bool slow_but_sure, int current_core, int64_t target_bytes) {
+  // There are two strategies for scavenging buffers:
+  // 1) Fast, opportunistic: Each arena is searched in succession. Although reservations
+  //    guarantee that the memory we need is available somewhere, this may fail if we
+  //    we race with another thread that returned buffers to an arena that we've already
+  //    searched and took the buffers from an arena we haven't yet searched.
+  // 2) Slow, guaranteed to succeed: In order to ensure that we can find the memory in a
+  //    single pass, we hold locks for all arenas we've already examined. That way, other
+  //    threads can't take the memory that we need from an arena that we haven't yet
+  //    examined (or from 'system_bytes_available_') because in order to do so, it would
+  //    have had to return the equivalent amount of memory to an earlier arena or added
+  //    it back into 'systems_bytes_reamining_'. The former can't happen since we're
+  //    still holding those locks, and the latter is solved by trying to decrease
+  //    system_bytes_remaining_ with DecreaseBytesRemaining() at the end.
+  DCHECK_GT(target_bytes, 0);
+  // First make sure we've used up all the headroom in the buffer limit.
+  int64_t bytes_found =
+      DecreaseBytesRemaining(target_bytes, false, &system_bytes_remaining_);
+  if (bytes_found == target_bytes) return bytes_found;
+
+  // In 'slow_but_sure' mode, we will hold locks for multiple arenas at the same time and
+  // therefore must start at 0 to respect the lock order. Otherwise we start with the
+  // current core's arena for locality and to avoid excessive contention on arena 0.
+  int start_core = slow_but_sure ? 0 : current_core;
+  vector> arena_locks;
+  if (slow_but_sure) arena_locks.resize(per_core_arenas_.size());
+
+  for (int i = 0; i < per_core_arenas_.size(); ++i) {
+    int core_to_check = (start_core + i) % per_core_arenas_.size();
+    FreeBufferArena* arena = per_core_arenas_[core_to_check].get();
+    int64_t bytes_needed = target_bytes - bytes_found;
+    bytes_found += arena->FreeSystemMemory(bytes_needed, bytes_needed,
+         slow_but_sure ? &arena_locks[i] : nullptr).second;
+    if (bytes_found == target_bytes) break;
+  }
+  DCHECK_LE(bytes_found, target_bytes);
+
+  // Decrement 'system_bytes_remaining_' while still holding the arena locks to avoid
+  // the window for a race with another thread that removes a buffer from a list and
+  // then increments 'system_bytes_remaining_'. The race is prevented because the other
+  // thread holds the lock while decrementing 'system_bytes_remaining_' in the cases
+  // where it may not have reservation corresponding to that memory.
+  if (slow_but_sure && bytes_found < target_bytes) {
+    bytes_found += DecreaseBytesRemaining(
+        target_bytes - bytes_found, true, &system_bytes_remaining_);
+    DCHECK_EQ(bytes_found, target_bytes) << DebugString();
+  }
+  return bytes_found;
+}
+
+void BufferPool::BufferAllocator::Free(BufferHandle&& handle) {
+  DCHECK(handle.is_open());
+  handle.client_ = nullptr; // Buffer is no longer associated with a client.
+  FreeBufferArena* arena = per_core_arenas_[handle.home_core_].get();
+  handle.Poison();
+  arena->AddFreeBuffer(move(handle));
+}
+
+void BufferPool::BufferAllocator::AddCleanPage(
+    const unique_lock& client_lock, Page* page) {
+  page->client->DCheckHoldsLock(client_lock);
+  FreeBufferArena* arena = per_core_arenas_[page->buffer.home_core_].get();
+  arena->AddCleanPage(page);
+}
+
+
+bool BufferPool::BufferAllocator::RemoveCleanPage(
+    const unique_lock& client_lock, bool claim_buffer, Page* page) {
+  page->client->DCheckHoldsLock(client_lock);
+  FreeBufferArena* arena;
+  {
+    lock_guard pl(page->buffer_lock);
+    // Page may be evicted - in which case it has no home core and is not in an arena.
+    if (!page->buffer.is_open()) return false;
+    arena = per_core_arenas_[page->buffer.home_core_].get();
+  }
+  return arena->RemoveCleanPage(claim_buffer, page);
+}
+
+void BufferPool::BufferAllocator::Maintenance() {
+  for (unique_ptr& arena : per_core_arenas_) arena->Maintenance();
+}
+
+void BufferPool::BufferAllocator::ReleaseMemory(int64_t bytes_to_free) {
+  int64_t bytes_freed = 0;
+  int current_core = CpuInfo::get_current_core();
+  for (int i = 0; i < per_core_arenas_.size(); ++i) {
+    int core_to_check = (current_core + i) % per_core_arenas_.size();
+    FreeBufferArena* arena = per_core_arenas_[core_to_check].get();
+    // Free but don't claim any memory.
+    bytes_freed += arena->FreeSystemMemory(bytes_to_free - bytes_freed, 0, nullptr).first;
+    if (bytes_freed >= bytes_to_free) return;
+  }
+}
+
+int BufferPool::BufferAllocator::GetFreeListSize(int core, int64_t len) {
+  return per_core_arenas_[core]->GetFreeListSize(len);
+}
+
+int64_t BufferPool::BufferAllocator::FreeToSystem(vector&& buffers) {
+  int64_t bytes_freed = 0;
+  for (BufferHandle& buffer : buffers) {
+    bytes_freed += buffer.len();
+    // Ensure that the memory is unpoisoned when it's next allocated by the system.
+    buffer.Unpoison();
+    system_allocator_->Free(move(buffer));
+  }
+  return bytes_freed;
+}
+
+int64_t BufferPool::BufferAllocator::SumOverArenas(
+    std::function compute_fn) const {
+  int64_t total = 0;
+  for (const unique_ptr& arena : per_core_arenas_) {
+    total += compute_fn(arena.get());
+  }
+  return total;
+}
+
+int64_t BufferPool::BufferAllocator::GetNumFreeBuffers() const {
+  return SumOverArenas([](FreeBufferArena* arena) { return arena->GetNumFreeBuffers(); });
+}
+
+int64_t BufferPool::BufferAllocator::GetFreeBufferBytes() const {
+  return SumOverArenas(
+      [](FreeBufferArena* arena) { return arena->GetFreeBufferBytes(); });
+}
+
+int64_t BufferPool::BufferAllocator::GetNumCleanPages() const {
+  return SumOverArenas([](FreeBufferArena* arena) { return arena->GetNumCleanPages(); });
+}
+
+int64_t BufferPool::BufferAllocator::GetCleanPageBytesLimit() const {
+  return clean_page_bytes_limit_;
+}
+
+int64_t BufferPool::BufferAllocator::GetCleanPageBytes() const {
+  return clean_page_bytes_limit_ - clean_page_bytes_remaining_.load();
+}
+
+string BufferPool::BufferAllocator::DebugString() {
+  stringstream ss;
+  ss << " " << this << " min_buffer_len: " << min_buffer_len_
+     << " system_bytes_limit: " << system_bytes_limit_
+     << " system_bytes_remaining: " << system_bytes_remaining_.load() << "\n"
+     << " clean_page_bytes_limit: " << clean_page_bytes_limit_
+     << " clean_page_bytes_remaining: " << clean_page_bytes_remaining_.load() << "\n";
+  for (int i = 0; i < per_core_arenas_.size(); ++i) {
+    ss << "  Arena " << i << " " << per_core_arenas_[i]->DebugString() << "\n";
+  }
+  return ss.str();
+}
+
+BufferPool::FreeBufferArena::FreeBufferArena(BufferAllocator* parent) : parent_(parent) {}
+
+BufferPool::FreeBufferArena::~FreeBufferArena() {
+  for (int i = 0; i < NumBufferSizes(); ++i) {
+    // Clear out the free lists.
+    FreeList* list = &buffer_sizes_[i].free_buffers;
+    vector buffers = list->GetBuffersToFree(list->Size());
+    parent_->system_bytes_remaining_.add(parent_->FreeToSystem(move(buffers)));
+
+    // All pages should have been destroyed.
+    DCHECK_EQ(0, buffer_sizes_[i].clean_pages.size());
+  }
+}
+
+void BufferPool::FreeBufferArena::AddFreeBuffer(BufferHandle&& buffer) {
+  lock_guard al(lock_);
+  if (config::FLAGS_disable_mem_pools) {
+    int64_t len = buffer.len();
+    parent_->system_allocator_->Free(move(buffer));
+    parent_->system_bytes_remaining_.add(len);
+    return;
+  }
+  PerSizeLists* lists = GetListsForSize(buffer.len());
+  lists->AddFreeBuffer(move(buffer));
+}
+
+bool BufferPool::FreeBufferArena::RemoveCleanPage(bool claim_buffer, Page* page) {
+  lock_guard al(lock_);
+  PerSizeLists* lists = GetListsForSize(page->len);
+  DCHECK_EQ(lists->num_clean_pages.load(), lists->clean_pages.size());
+  if (!lists->clean_pages.remove(page)) return false;
+  lists->num_clean_pages.add(-1);
+  parent_->clean_page_bytes_remaining_.add(page->len);
+  if (!claim_buffer) {
+    BufferHandle buffer;
+    {
+      lock_guard pl(page->buffer_lock);
+      buffer = move(page->buffer);
+    }
+    lists->AddFreeBuffer(move(buffer));
+  }
+  return true;
+}
+
+bool BufferPool::FreeBufferArena::PopFreeBuffer(
+    int64_t buffer_len, BufferHandle* buffer) {
+  PerSizeLists* lists = GetListsForSize(buffer_len);
+  // Check before acquiring lock.
+  if (lists->num_free_buffers.load() == 0) return false;
+
+  lock_guard al(lock_);
+  FreeList* list = &lists->free_buffers;
+  DCHECK_EQ(lists->num_free_buffers.load(), list->Size());
+  if (!list->PopFreeBuffer(buffer)) return false;
+  buffer->Unpoison();
+  lists->num_free_buffers.add(-1);
+  lists->low_water_mark = min(lists->low_water_mark, list->Size());
+  return true;
+}
+/*
+bool BufferPool::FreeBufferArena::EvictCleanPage(
+    int64_t buffer_len, BufferHandle* buffer) {
+  PerSizeLists* lists = GetListsForSize(buffer_len);
+  // Check before acquiring lock.
+  if (lists->num_clean_pages.Load() == 0) return false;
+
+  lock_guard al(lock_);
+  DCHECK_EQ(lists->num_clean_pages.Load(), lists->clean_pages.size());
+  Page* page = lists->clean_pages.dequeue();
+  if (page == nullptr) return false;
+  lists->num_clean_pages.Add(-1);
+  parent_->clean_page_bytes_remaining_.Add(buffer_len);
+  lock_guard pl(page->buffer_lock);
+  *buffer = move(page->buffer);
+  return true;
+}
+*/
+std::pair BufferPool::FreeBufferArena::FreeSystemMemory(
+    int64_t target_bytes_to_free, int64_t target_bytes_to_claim,
+    std::unique_lock* arena_lock) {
+  DCHECK_GT(target_bytes_to_free, 0);
+  DCHECK_GE(target_bytes_to_free, target_bytes_to_claim);
+  int64_t bytes_freed = 0;
+  // If the caller is acquiring the lock, just lock for the whole method.
+  // Otherwise lazily acquire the lock the first time we find some memory
+  // to free.
+  std::unique_lock al(lock_, std::defer_lock_t());
+  if (arena_lock != nullptr) al.lock();
+
+  vector buffers;
+  // Search from largest to smallest to avoid freeing many small buffers unless
+  // necessary.
+  for (int i = NumBufferSizes() - 1; i >= 0; --i) {
+    PerSizeLists* lists = &buffer_sizes_[i];
+    // Check before acquiring lock to avoid expensive lock acquisition and make scanning
+    // empty lists much cheaper.
+    if (lists->num_free_buffers.load() == 0 && lists->num_clean_pages.load() == 0) {
+      continue;
+    }
+    if (!al.owns_lock()) al.lock();
+    FreeList* free_buffers = &lists->free_buffers;
+    InternalList* clean_pages = &lists->clean_pages;
+    DCHECK_EQ(lists->num_free_buffers.load(), free_buffers->Size());
+    DCHECK_EQ(lists->num_clean_pages.load(), clean_pages->size());
+
+    // Figure out how many of the buffers in the free list we should free.
+    DCHECK_GT(target_bytes_to_free, bytes_freed);
+    const int64_t buffer_len = 1L << (i + parent_->log_min_buffer_len_);
+    int64_t buffers_to_free = min(free_buffers->Size(),
+        BitUtil::Ceil(target_bytes_to_free - bytes_freed, buffer_len));
+    int64_t buffer_bytes_to_free = buffers_to_free * buffer_len;
+
+    // Evict clean pages by moving their buffers to the free page list before freeing
+    // them. This ensures that they are freed based on memory address in the expected
+    // order.
+    int num_pages_evicted = 0;
+    int64_t page_bytes_evicted = 0;
+    while (bytes_freed + buffer_bytes_to_free < target_bytes_to_free) {
+      Page* page = clean_pages->dequeue();
+      if (page == nullptr) break;
+      BufferHandle page_buffer;
+      {
+        lock_guard pl(page->buffer_lock);
+        page_buffer = move(page->buffer);
+      }
+      ++buffers_to_free;
+      buffer_bytes_to_free += page_buffer.len();
+      ++num_pages_evicted;
+      page_bytes_evicted += page_buffer.len();
+      free_buffers->AddFreeBuffer(move(page_buffer));
+    }
+    lists->num_free_buffers.add(num_pages_evicted);
+    lists->num_clean_pages.add(-num_pages_evicted);
+    parent_->clean_page_bytes_remaining_.add(page_bytes_evicted);
+
+    if (buffers_to_free > 0) {
+      int64_t buffer_bytes_freed =
+          parent_->FreeToSystem(free_buffers->GetBuffersToFree(buffers_to_free));
+      DCHECK_EQ(buffer_bytes_to_free, buffer_bytes_freed);
+      bytes_freed += buffer_bytes_to_free;
+      lists->num_free_buffers.add(-buffers_to_free);
+      lists->low_water_mark = min(lists->low_water_mark, free_buffers->Size());
+      if (bytes_freed >= target_bytes_to_free) break;
+    }
+    // Should have cleared out all lists if we don't have enough memory at this point.
+    DCHECK_EQ(0, free_buffers->Size());
+    DCHECK_EQ(0, clean_pages->size());
+  }
+  int64_t bytes_claimed = min(bytes_freed, target_bytes_to_claim);
+  if (bytes_freed > bytes_claimed) {
+    // Add back the extra for other threads before releasing the lock to avoid race
+    // where the other thread may not be able to find enough buffers.
+    parent_->system_bytes_remaining_.add(bytes_freed - bytes_claimed);
+  }
+  if (arena_lock != nullptr) *arena_lock = move(al);
+  return std::make_pair(bytes_freed, bytes_claimed);
+}
+
+void BufferPool::FreeBufferArena::AddCleanPage(Page* page) {
+  bool eviction_needed = config::FLAGS_disable_mem_pools
+    || DecreaseBytesRemaining(
+        page->len, true, &parent_->clean_page_bytes_remaining_) == 0;
+  lock_guard al(lock_);
+  PerSizeLists* lists = GetListsForSize(page->len);
+  DCHECK_EQ(lists->num_clean_pages.load(), lists->clean_pages.size());
+  if (eviction_needed) {
+    if (lists->clean_pages.empty()) {
+      // No other pages to evict, must evict 'page' instead of adding it.
+      lists->AddFreeBuffer(move(page->buffer));
+    } else {
+      // Evict an older page (FIFO eviction) to make space for this one.
+      Page* page_to_evict = lists->clean_pages.dequeue();
+      lists->clean_pages.enqueue(page);
+      BufferHandle page_to_evict_buffer;
+      {
+        lock_guard pl(page_to_evict->buffer_lock);
+        page_to_evict_buffer = move(page_to_evict->buffer);
+      }
+      lists->AddFreeBuffer(move(page_to_evict_buffer));
+    }
+  } else {
+    lists->clean_pages.enqueue(page);
+    lists->num_clean_pages.add(1);
+  }
+}
+
+void BufferPool::FreeBufferArena::Maintenance() {
+  lock_guard al(lock_);
+  for (int i = 0; i < NumBufferSizes(); ++i) {
+    PerSizeLists* lists = &buffer_sizes_[i];
+    DCHECK_LE(lists->low_water_mark, lists->free_buffers.Size());
+    if (lists->low_water_mark != 0) {
+      // We haven't needed the buffers below the low water mark since the previous
+      // Maintenance() call. Discard half of them to free up memory. By always discarding
+      // at least one, we guarantee that an idle list will shrink to zero entries.
+      int num_to_free = max(1, lists->low_water_mark / 2);
+      parent_->system_bytes_remaining_.add(
+          parent_->FreeToSystem(lists->free_buffers.GetBuffersToFree(num_to_free)));
+      lists->num_free_buffers.add(-num_to_free);
+    }
+    lists->low_water_mark = lists->free_buffers.Size();
+  }
+}
+
+int BufferPool::FreeBufferArena::GetFreeListSize(int64_t len) {
+  lock_guard al(lock_);
+  PerSizeLists* lists = GetListsForSize(len);
+  DCHECK_EQ(lists->num_free_buffers.load(), lists->free_buffers.Size());
+  return lists->free_buffers.Size();
+}
+
+int64_t BufferPool::FreeBufferArena::SumOverSizes(
+    std::function compute_fn) {
+  int64_t total = 0;
+  for (int i = 0; i < NumBufferSizes(); ++i) {
+    int64_t buffer_size = (1L << i) * parent_->min_buffer_len_;
+    total += compute_fn(&buffer_sizes_[i], buffer_size);
+  }
+  return total;
+}
+
+int64_t BufferPool::FreeBufferArena::GetNumFreeBuffers() {
+  return SumOverSizes([](PerSizeLists* lists, int64_t buffer_size) {
+    return lists->num_free_buffers.load();
+  });
+}
+
+int64_t BufferPool::FreeBufferArena::GetFreeBufferBytes() {
+  return SumOverSizes([](PerSizeLists* lists, int64_t buffer_size) {
+    return lists->num_free_buffers.load() * buffer_size;
+  });
+}
+
+int64_t BufferPool::FreeBufferArena::GetNumCleanPages() {
+  return SumOverSizes([](PerSizeLists* lists, int64_t buffer_size) {
+    return lists->num_clean_pages.load();
+  });
+}
+
+string BufferPool::FreeBufferArena::DebugString() {
+  lock_guard al(lock_);
+  stringstream ss;
+  ss << " " << this << "\n";
+  for (int i = 0; i < NumBufferSizes(); ++i) {
+    int64_t buffer_len = 1L << (parent_->log_min_buffer_len_ + i);
+    PerSizeLists& lists = buffer_sizes_[i];
+    ss << "  " << PrettyPrinter::print_bytes(buffer_len) << ":"
+       << " free buffers: " << lists.num_free_buffers.load()
+       << " low water mark: " << lists.low_water_mark
+       << " clean pages: " << lists.num_clean_pages.load() << " ";
+    lists.clean_pages.iterate(bind(Page::DebugStringCallback, &ss, _1));
+
+    ss << "\n";
+  }
+  return ss.str();
+}
+}
diff --git a/be/src/runtime/bufferpool/buffer_allocator.h b/be/src/runtime/bufferpool/buffer_allocator.h
new file mode 100644
index 0000000000..ade8d8ff04
--- /dev/null
+++ b/be/src/runtime/bufferpool/buffer_allocator.h
@@ -0,0 +1,248 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_RUNTIME_BUFFER_ALLOCATOR_H
+#define BDG_PALO_BE_RUNTIME_BUFFER_ALLOCATOR_H
+
+#include 
+
+#include "runtime/bufferpool/buffer_pool_internal.h"
+#include "runtime/bufferpool/free_list.h"
+#include "util/aligned_new.h"
+
+namespace palo {
+
+/// The internal buffer allocator used by BufferPool to allocator power-of-two sized
+/// buffers. BufferAllocator builds on top of SystemAllocator by adding caching of
+/// free buffers and clean pages where the memory is not currently in use by a client
+/// but has not yet been released to SystemAllocator.
+///
+/// The allocator is optimised for the common case where an allocation can be served
+/// by reclaiming a buffer of the request size from the current core's arena. In this
+/// case there is no contention for locks between concurrently-running threads. If this
+/// fails, progressively more expensive approaches to allocate memory are tried until
+/// the allocation eventually success (see AllocateInternal() for details).
+///
+/// Buffer Reservations
+/// ===================
+/// The implementation of the BufferAllocator relies on the BufferPool's reservation
+/// tracking system. The allocator is given a hard limit ('system_bytes_limit'), above
+/// which all allocations will fail. Allocations up to 'system_bytes_limit' are
+/// guaranteed to succeed unless an unexpected system error occurs (e.g. we can't allocate
+/// all of the required memory from the OS). Reservations must be set up so that the total
+/// of all reservations does not exceed 'system_bytes_limit', thus ensuring that
+/// BufferAllocator can alway find memory to fulfill reservations.
+///
+/// +========================+
+/// | IMPLEMENTATION NOTES   |
+/// +========================+
+///
+/// Memory
+/// ======
+/// Memory managed by BufferAllocator comes in four forms:
+/// 1. Buffers returned to the client (corresponding to a used reservation)
+/// 2. Free buffers cached in the BufferAllocator's free lists.
+/// 3. Buffers attached to clean unpinned pages in the BufferAllocator's clean page lists.
+/// 4. Bytes that are not allocated from the system: 'system_bytes_remaining_'.
+/// Together these always add up to 'system_bytes_limit', which allows BufferAllocator
+/// to always fulfill reservations via some combination of memory in forms 2, 3 or 4.
+///
+/// The BufferAllocator code is careful not to make memory inaccessible to concurrently
+/// executing threads that are entitled to it. E.g. if one thread is entitled to allocate
+/// a 1MB buffer from the BufferAllocator's free or clean page lists but needs to release
+/// a 2MB buffer to the system to free up enough memory, it must add 1MB to
+/// 'system_bytes_remaining_' in the same critical section in which it freed the 2MB
+/// buffer. Otherwise a concurrent thread that had a reservation for 1MB of memory might
+/// not be able to find it.
+///
+/// Arenas
+/// ======
+/// The buffer allocator's data structures are broken up into arenas, with an arena per
+/// core. Within each arena, each buffer or page is stored in a list with buffers and
+/// pages of the same size: there is a separate list for every power-of-two size. Each
+/// arena is protected by a separate lock, so in the common case where threads are able
+/// to fulfill allocations from their own arena, there will be no lock contention.
+///
+struct BufferPool::BufferAllocator {
+  
+  BufferAllocator(BufferPool* pool, int64_t min_buffer_len, int64_t system_bytes_limit,
+      int64_t clean_page_bytes_limit);
+  ~BufferAllocator();
+
+  /// Allocate a buffer with a power-of-two length 'len'. This function may acquire
+  /// 'FreeBufferArena::lock_' and Page::lock so no locks lower in the lock acquisition
+  /// order (see buffer-pool-internal.h) should be held by the caller.
+  ///
+  /// Always succeeds on allocating memory up to 'system_bytes_limit', unless the system
+  /// is unable to give us 'system_bytes_limit' of memory or an internal bug: if all
+  /// clients write out enough dirty pages to stay within their reservation, then there
+  /// should always be enough free buffers and clean pages to reclaim.
+  Status Allocate(ClientHandle* client, int64_t len,
+      BufferPool::BufferHandle* buffer) WARN_UNUSED_RESULT;
+
+  /// Frees 'buffer', which must be open before calling. Closes 'buffer' and updates
+  /// internal state but does not release to any reservation.
+  void Free(BufferPool::BufferHandle&& buffer);
+
+  /// Adds a clean page 'page' to a clean page list. Caller must hold the page's
+  /// client's lock via 'client_lock' so that moving the page between the client list and
+  /// the free page list is atomic. Caller must not hold 'FreeBufferArena::lock_' or any
+  /// Page::lock.
+  void AddCleanPage(const boost::unique_lock& client_lock, Page* page);
+
+  /// Removes a clean page 'page' from a clean page list and returns true, if present in
+  /// one of the lists. Returns true if it was present. If 'claim_buffer' is true, the
+  /// caller must have reservation for the buffer, which is returned along with the page.
+  /// Otherwise the buffer is moved directly to the free buffer list. Caller must hold
+  /// the page's client's lock via 'client_lock' so that moving the page between the
+  /// client list and the free page list is atomic. Caller must not hold
+  /// 'FreeBufferArena::lock_' or any Page::lock.
+  bool RemoveCleanPage(
+      const boost::unique_lock& client_lock, bool claim_buffer, Page* page);
+
+  /// Periodically called to release free buffers back to the SystemAllocator. Releases
+  /// buffers based on recent allocation patterns, trying to minimise the number of
+  /// excess buffers retained in each list above the minimum required to avoid going
+  /// to the system allocator.
+  void Maintenance();
+
+  /// Try to release at least 'bytes_to_free' bytes of memory to the system allocator.
+  void ReleaseMemory(int64_t bytes_to_free);
+
+  int64_t system_bytes_limit() const { return system_bytes_limit_; }
+
+  /// Return the amount of memory currently allocated from the system.
+  int64_t GetSystemBytesAllocated() const {
+    return system_bytes_limit_ - system_bytes_remaining_.load();
+  }
+
+  /// Return the total number of free buffers in the allocator.
+  int64_t GetNumFreeBuffers() const;
+
+  /// Return the total bytes of free buffers in the allocator.
+  int64_t GetFreeBufferBytes() const;
+
+  /// Return the limit on bytes of clean pages in the allocator.
+  int64_t GetCleanPageBytesLimit() const;
+
+  /// Return the total number of clean pages in the allocator.
+  int64_t GetNumCleanPages() const;
+
+  /// Return the total bytes of clean pages in the allocator.
+  int64_t GetCleanPageBytes() const;
+
+  std::string DebugString();
+
+ protected:
+  friend class BufferAllocatorTest;
+  friend class BufferPoolTest;
+  friend class FreeBufferArena;
+
+  /// Test helper: gets the current size of the free list for buffers of 'len' bytes
+  /// on core 'core'.
+  int GetFreeListSize(int core, int64_t len);
+
+  /// Test helper: reduce the number of scavenge attempts so backend tests can force
+  /// use of the "locked" scavenging code path.
+  void set_max_scavenge_attempts(int val) {
+    DCHECK_GE(val, 1);
+    max_scavenge_attempts_ = val;
+  }
+
+ private:
+  /// Compute the maximum power-of-two buffer length that could be allocated based on the
+  /// amount of memory available 'system_bytes_limit'. The value is always at least
+  /// 'min_buffer_len' so that there is at least one valid buffer size.
+  static int64_t CalcMaxBufferLen(int64_t min_buffer_len, int64_t system_bytes_limit);
+
+  /// Same as Allocate() but leaves 'buffer->client_' NULL and does not update counters.
+  Status AllocateInternal(
+      int64_t len, BufferPool::BufferHandle* buffer) WARN_UNUSED_RESULT;
+
+  /// Tries to reclaim enough memory from various sources so that the caller can allocate
+  /// a buffer of 'target_bytes' from the system allocator. Scavenges buffers from the
+  /// free buffer and clean page lists of all cores and frees them with
+  /// 'system_allocator_'. Also tries to decrement 'system_bytes_remaining_'.
+  /// 'current_core' is the index of the current CPU core. Any bytes freed in excess of
+  /// 'target_bytes' are added to 'system_bytes_remaining_.' If 'slow_but_sure' is true,
+  /// this function uses a slower strategy that guarantees enough memory will be found
+  /// but can block progress of other threads for longer. If 'slow_but_sure' is false,
+  /// then this function optimistically tries to reclaim the memory but may not reclaim
+  /// 'target_bytes' of memory. Returns the number of bytes reclaimed.
+  int64_t ScavengeBuffers(bool slow_but_sure, int current_core, int64_t target_bytes);
+
+  /// Helper to free a list of buffers to the system. Returns the number of bytes freed.
+  int64_t FreeToSystem(std::vector&& buffers);
+
+  /// Compute a sum over all arenas. Does not lock the arenas.
+  int64_t SumOverArenas(std::function compute_fn) const;
+
+  /// The pool that this allocator is associated with.
+  BufferPool* const pool_;
+
+  /// System allocator that is ultimately used to allocate and free buffers.
+  const boost::scoped_ptr system_allocator_;
+
+  /// The minimum power-of-two buffer length that can be allocated.
+  const int64_t min_buffer_len_;
+
+  /// The maximum power-of-two buffer length that can be allocated. Always >=
+  /// 'min_buffer_len' so that there is at least one valid buffer size.
+  const int64_t max_buffer_len_;
+
+  /// The log2 of 'min_buffer_len_'.
+  const int log_min_buffer_len_;
+
+  /// The log2 of 'max_buffer_len_'.
+  const int log_max_buffer_len_;
+
+  /// The maximum physical memory in bytes that will be allocated from the system.
+  const int64_t system_bytes_limit_;
+
+  /// The remaining number of bytes of 'system_bytes_limit_' that can be used for
+  /// allocating new buffers. Must be updated atomically before a new buffer is
+  /// allocated or after an existing buffer is freed with the system allocator.
+  AtomicInt64 system_bytes_remaining_;
+
+  /// The maximum bytes of clean pages that can accumulate across all arenas before
+  /// they will be evicted.
+  const int64_t clean_page_bytes_limit_;
+
+  /// The number of bytes of 'clean_page_bytes_limit_' not used by clean pages. I.e.
+  /// (clean_page_bytes_limit - bytes of clean pages in the BufferAllocator).
+  /// 'clean_pages_bytes_limit_' is enforced by increasing this value before a
+  /// clean page is added and decreasing it after a clean page is reclaimed or evicted.
+  AtomicInt64 clean_page_bytes_remaining_;
+
+  /// Free and clean pages. One arena per core.
+  std::vector> per_core_arenas_;
+
+  /// Default number of times to attempt scavenging.
+  static const int MAX_SCAVENGE_ATTEMPTS = 3;
+
+  /// Number of times to attempt scavenging. Usually MAX_SCAVENGE_ATTEMPTS but can be
+  /// overridden by tests. The first max_scavenge_attempts_ - 1 attempts do not lock
+  /// all arenas so may fail. The final attempt locks all arenas, which is expensive
+  /// but is guaranteed to succeed.
+  int max_scavenge_attempts_;
+};
+}
+
+#endif
diff --git a/be/src/runtime/bufferpool/buffer_pool.cc b/be/src/runtime/bufferpool/buffer_pool.cc
new file mode 100644
index 0000000000..78829c9745
--- /dev/null
+++ b/be/src/runtime/bufferpool/buffer_pool.cc
@@ -0,0 +1,779 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "runtime/bufferpool/buffer_pool_internal.h"
+
+#include 
+#include 
+#include 
+
+#include "common/names.h"
+#include "gutil/strings/substitute.h"
+#include "runtime/bufferpool/buffer_allocator.h"
+#include "util/bit_util.h"
+#include "util/cpu_info.h"
+#include "util/runtime_profile.h"
+#include "util/time.h"
+#include "util/uid_util.h"
+#include "gutil/strings/substitute.h"
+
+//DEFINE_int32(concurrent_scratch_ios_per_device, 2,
+//    "Set this to influence the number of concurrent write I/Os issues to write data to "
+//    "scratch files. This is multiplied by the number of active scratch directories to "
+//    "obtain the target number of scratch write I/Os per query.");
+
+namespace palo {
+
+constexpr int BufferPool::LOG_MAX_BUFFER_BYTES;
+constexpr int64_t BufferPool::MAX_BUFFER_BYTES;
+
+void BufferPool::BufferHandle::Open(uint8_t* data, int64_t len, int home_core) {
+  DCHECK_LE(0, home_core);
+  DCHECK_LT(home_core, CpuInfo::get_max_num_cores());
+  client_ = nullptr;
+  data_ = data;
+  len_ = len;
+  home_core_ = home_core;
+}
+
+BufferPool::PageHandle::PageHandle() {
+  Reset();
+}
+
+BufferPool::PageHandle::PageHandle(PageHandle&& src) {
+  Reset();
+  *this = std::move(src);
+}
+
+BufferPool::PageHandle& BufferPool::PageHandle::operator=(PageHandle&& src) {
+  DCHECK(!is_open());
+  // Copy over all members then close src.
+  page_ = src.page_;
+  client_ = src.client_;
+  src.Reset();
+  return *this;
+}
+
+void BufferPool::PageHandle::Open(Page* page, ClientHandle* client) {
+  DCHECK(!is_open());
+  page_ = page;
+  client_ = client;
+}
+
+void BufferPool::PageHandle::Reset() {
+  page_ = NULL;
+  client_ = NULL;
+}
+
+int BufferPool::PageHandle::pin_count() const {
+  DCHECK(is_open());
+  // The pin count can only be modified via this PageHandle, which must not be
+  // concurrently accessed by multiple threads, so it is safe to access without locking
+  return page_->pin_count;
+}
+
+int64_t BufferPool::PageHandle::len() const {
+  DCHECK(is_open());
+  return page_->len; // Does not require locking.
+}
+
+Status BufferPool::PageHandle::GetBuffer(const BufferHandle** buffer) const {
+  DCHECK(is_open());
+  DCHECK(client_->is_registered());
+  DCHECK(is_pinned());
+/*
+  if (page_->pin_in_flight) {
+    // Finish the work started in Pin().
+    RETURN_IF_ERROR(client_->impl_->FinishMoveEvictedToPinned(page_));
+  }
+*/
+  DCHECK(!page_->pin_in_flight);
+  *buffer = &page_->buffer;
+  DCHECK((*buffer)->is_open());
+  return Status::OK;
+}
+
+BufferPool::BufferPool(int64_t min_buffer_len, int64_t buffer_bytes_limit,
+      int64_t clean_page_bytes_limit)
+  : allocator_(new BufferAllocator(
+        this, min_buffer_len, buffer_bytes_limit, clean_page_bytes_limit)),
+    min_buffer_len_(min_buffer_len) {
+  DCHECK_GT(min_buffer_len, 0);
+  DCHECK_EQ(min_buffer_len, BitUtil::RoundUpToPowerOfTwo(min_buffer_len));
+}
+
+BufferPool::~BufferPool() {}
+
+Status BufferPool::RegisterClient(const string& name, //TmpFileMgr::FileGroup* file_group,
+    ReservationTracker* parent_reservation, MemTracker* mem_tracker,
+    int64_t reservation_limit, RuntimeProfile* profile, ClientHandle* client) {
+  DCHECK(!client->is_registered());
+  DCHECK(parent_reservation != NULL);
+  client->impl_ = new Client(this, //file_group, 
+          name, parent_reservation, mem_tracker,
+      reservation_limit, profile);
+  return Status::OK;
+}
+
+void BufferPool::DeregisterClient(ClientHandle* client) {
+  if (!client->is_registered()) return;
+  client->impl_->Close(); // Will DCHECK if any remaining buffers or pinned pages.
+  delete client->impl_; // Will DCHECK if there are any remaining pages.
+  client->impl_ = NULL;
+}
+
+Status BufferPool::CreatePage(
+    ClientHandle* client, int64_t len, PageHandle* handle, const BufferHandle** buffer) {
+  DCHECK(!handle->is_open());
+  DCHECK_GE(len, min_buffer_len_);
+  DCHECK_EQ(len, BitUtil::RoundUpToPowerOfTwo(len));
+
+  BufferHandle new_buffer;
+  // No changes have been made to state yet, so we can cleanly return on error.
+  RETURN_IF_ERROR(AllocateBuffer(client, len, &new_buffer));
+  Page* page = client->impl_->CreatePinnedPage(move(new_buffer));
+  handle->Open(page, client);
+  if (buffer != nullptr) *buffer = &page->buffer;
+  return Status::OK;
+}
+
+void BufferPool::DestroyPage(ClientHandle* client, PageHandle* handle) {
+  if (!handle->is_open()) return; // DestroyPage() should be idempotent.
+
+  if (handle->is_pinned()) {
+    // Cancel the read I/O - we don't need the data any more.
+    //if (handle->page_->pin_in_flight) {
+    //  handle->page_->write_handle->CancelRead();
+    //  handle->page_->pin_in_flight = false;
+    //}
+    // In the pinned case, delegate to ExtractBuffer() and FreeBuffer() to do the work
+    // of cleaning up the page, freeing the buffer and updating reservations correctly.
+    BufferHandle buffer;
+    Status status = ExtractBuffer(client, handle, &buffer);
+    DCHECK(status.ok()) << status.get_error_msg();
+    FreeBuffer(client, &buffer);
+  } else {
+    // In the unpinned case, no reservations are used so we just clean up the page.
+    client->impl_->DestroyPageInternal(handle);
+  }
+}
+
+Status BufferPool::Pin(ClientHandle* client, PageHandle* handle) {
+  DCHECK(client->is_registered());
+  DCHECK(handle->is_open());
+  DCHECK_EQ(handle->client_, client);
+
+  Page* page = handle->page_;
+  if (page->pin_count == 0) {
+    RETURN_IF_ERROR(client->impl_->StartMoveToPinned(client, page));
+    COUNTER_UPDATE(client->impl_->counters().peak_unpinned_bytes, -page->len);
+  }
+  // Update accounting last to avoid complicating the error return path above.
+  ++page->pin_count;
+  client->impl_->reservation()->AllocateFrom(page->len);
+  return Status::OK;
+}
+
+void BufferPool::Unpin(ClientHandle* client, PageHandle* handle) {
+  DCHECK(handle->is_open());
+  DCHECK(client->is_registered());
+  DCHECK_EQ(handle->client_, client);
+  // If handle is pinned, we can assume that the page itself is pinned.
+  DCHECK(handle->is_pinned());
+  Page* page = handle->page_;
+  ReservationTracker* reservation = client->impl_->reservation();
+  reservation->ReleaseTo(page->len);
+
+  if (--page->pin_count > 0) return;
+  //if (page->pin_in_flight) {
+    // Data is not in memory - move it back to evicted.
+  //  client->impl_->UndoMoveEvictedToPinned(page);
+  //} else {
+    // Data is in memory - move it to dirty unpinned.
+  client->impl_->MoveToDirtyUnpinned(page);
+  //}
+  COUNTER_UPDATE(client->impl_->counters().peak_unpinned_bytes, handle->len());
+}
+
+Status BufferPool::ExtractBuffer(
+    ClientHandle* client, PageHandle* page_handle, BufferHandle* buffer_handle) {
+  DCHECK(page_handle->is_pinned());
+  DCHECK(!buffer_handle->is_open());
+  DCHECK_EQ(page_handle->client_, client);
+
+  // If an async pin is in flight, we need to wait for it.
+  const BufferHandle* dummy;
+  RETURN_IF_ERROR(page_handle->GetBuffer(&dummy));
+
+  // Bring the pin count to 1 so that we're not using surplus reservations.
+  while (page_handle->pin_count() > 1) Unpin(client, page_handle);
+
+  // Destroy the page and extract the buffer.
+  client->impl_->DestroyPageInternal(page_handle, buffer_handle);
+  DCHECK(buffer_handle->is_open());
+  return Status::OK;
+}
+
+Status BufferPool::AllocateBuffer(
+    ClientHandle* client, int64_t len, BufferHandle* handle) {
+  RETURN_IF_ERROR(client->impl_->PrepareToAllocateBuffer(len));
+  Status status = allocator_->Allocate(client, len, handle);
+  if (!status.ok()) {
+    // Allocation failed - update client's accounting to reflect the failure.
+    client->impl_->FreedBuffer(len);
+  }
+  return status;
+}
+
+void BufferPool::FreeBuffer(ClientHandle* client, BufferHandle* handle) {
+  if (!handle->is_open()) return; // Should be idempotent.
+  DCHECK_EQ(client, handle->client_);
+  int64_t len = handle->len_;
+  allocator_->Free(move(*handle));
+  client->impl_->FreedBuffer(len);
+}
+
+Status BufferPool::TransferBuffer(ClientHandle* src_client, BufferHandle* src,
+    ClientHandle* dst_client, BufferHandle* dst) {
+  DCHECK(src->is_open());
+  DCHECK(!dst->is_open());
+  DCHECK_EQ(src_client, src->client_);
+  DCHECK_NE(src, dst);
+  DCHECK_NE(src_client, dst_client);
+
+  dst_client->impl_->reservation()->AllocateFrom(src->len());
+  src_client->impl_->reservation()->ReleaseTo(src->len());
+  *dst = std::move(*src);
+  dst->client_ = dst_client;
+  return Status::OK;
+}
+
+void BufferPool::Maintenance() {
+  allocator_->Maintenance();
+}
+
+void BufferPool::ReleaseMemory(int64_t bytes_to_free) {
+  allocator_->ReleaseMemory(bytes_to_free);
+}
+
+int64_t BufferPool::GetSystemBytesLimit() const {
+  return allocator_->system_bytes_limit();
+}
+
+int64_t BufferPool::GetSystemBytesAllocated() const {
+  return allocator_->GetSystemBytesAllocated();
+}
+
+int64_t BufferPool::GetCleanPageBytesLimit() const {
+  return allocator_->GetCleanPageBytesLimit();
+}
+
+int64_t BufferPool::GetNumCleanPages() const {
+  return allocator_->GetNumCleanPages();
+}
+
+int64_t BufferPool::GetCleanPageBytes() const {
+  return allocator_->GetCleanPageBytes();
+}
+
+int64_t BufferPool::GetNumFreeBuffers() const {
+  return allocator_->GetNumFreeBuffers();
+}
+
+int64_t BufferPool::GetFreeBufferBytes() const {
+  return allocator_->GetFreeBufferBytes();
+}
+
+bool BufferPool::ClientHandle::IncreaseReservation(int64_t bytes) {
+  return impl_->reservation()->IncreaseReservation(bytes);
+}
+
+bool BufferPool::ClientHandle::IncreaseReservationToFit(int64_t bytes) {
+  return impl_->reservation()->IncreaseReservationToFit(bytes);
+}
+
+Status BufferPool::ClientHandle::DecreaseReservationTo(int64_t target_bytes) {
+  return impl_->DecreaseReservationTo(target_bytes);
+}
+
+int64_t BufferPool::ClientHandle::GetReservation() const {
+  return impl_->reservation()->GetReservation();
+}
+
+int64_t BufferPool::ClientHandle::GetUsedReservation() const {
+  return impl_->reservation()->GetUsedReservation();
+}
+
+int64_t BufferPool::ClientHandle::GetUnusedReservation() const {
+  return impl_->reservation()->GetUnusedReservation();
+}
+
+bool BufferPool::ClientHandle::TransferReservationFrom(
+    ReservationTracker* src, int64_t bytes) {
+  return src->TransferReservationTo(impl_->reservation(), bytes);
+}
+
+bool BufferPool::ClientHandle::TransferReservationTo(
+    ReservationTracker* dst, int64_t bytes) {
+  return impl_->reservation()->TransferReservationTo(dst, bytes);
+}
+
+void BufferPool::ClientHandle::SaveReservation(SubReservation* dst, int64_t bytes) {
+  DCHECK_EQ(dst->tracker_->parent(), impl_->reservation());
+  bool success = impl_->reservation()->TransferReservationTo(dst->tracker_.get(), bytes);
+  DCHECK(success); // SubReservation should not have a limit, so this shouldn't fail.
+}
+
+void BufferPool::ClientHandle::RestoreReservation(SubReservation* src, int64_t bytes) {
+  DCHECK_EQ(src->tracker_->parent(), impl_->reservation());
+  bool success = src->tracker_->TransferReservationTo(impl_->reservation(), bytes);
+  DCHECK(success); // Transferring reservation to parent shouldn't fail.
+}
+
+void BufferPool::ClientHandle::SetDebugDenyIncreaseReservation(double probability) {
+  impl_->reservation()->SetDebugDenyIncreaseReservation(probability);
+}
+
+bool BufferPool::ClientHandle::has_unpinned_pages() const {
+  return impl_->has_unpinned_pages();
+}
+
+BufferPool::SubReservation::SubReservation(ClientHandle* client) {
+  tracker_.reset(new ReservationTracker);
+  tracker_->InitChildTracker(
+      nullptr, client->impl_->reservation(), nullptr, numeric_limits::max());
+}
+
+BufferPool::SubReservation::~SubReservation() {}
+
+int64_t BufferPool::SubReservation::GetReservation() const {
+  return tracker_->GetReservation();
+}
+
+void BufferPool::SubReservation::Close() {
+  // Give any reservation back to the client.
+  if (is_closed()) return;
+  bool success =
+      tracker_->TransferReservationTo(tracker_->parent(), tracker_->GetReservation());
+  DCHECK(success); // Transferring reservation to parent shouldn't fail.
+  tracker_->Close();
+  tracker_.reset();
+}
+
+BufferPool::Client::Client(BufferPool* pool, //TmpFileMgr::FileGroup* file_group,
+    const string& name, ReservationTracker* parent_reservation, MemTracker* mem_tracker,
+    int64_t reservation_limit, RuntimeProfile* profile)
+  : pool_(pool),
+    //file_group_(file_group),
+    name_(name),
+    debug_write_delay_ms_(0),
+    num_pages_(0),
+    buffers_allocated_bytes_(0) {
+  // Set up a child profile with buffer pool info.
+  RuntimeProfile* child_profile = profile->create_child("Buffer pool", true, true);
+  reservation_.InitChildTracker(
+      child_profile, parent_reservation, mem_tracker, reservation_limit);
+  counters_.alloc_time = ADD_TIMER(child_profile, "AllocTime");
+  counters_.cumulative_allocations =
+      ADD_COUNTER(child_profile, "CumulativeAllocations", TUnit::UNIT);
+  counters_.cumulative_bytes_alloced =
+      ADD_COUNTER(child_profile, "CumulativeAllocationBytes", TUnit::BYTES);
+  counters_.read_wait_time = ADD_TIMER(child_profile, "ReadIoWaitTime");
+  counters_.read_io_ops = ADD_COUNTER(child_profile, "ReadIoOps", TUnit::UNIT);
+  counters_.bytes_read = ADD_COUNTER(child_profile, "ReadIoBytes", TUnit::BYTES);
+  counters_.write_wait_time = ADD_TIMER(child_profile, "WriteIoWaitTime");
+  counters_.write_io_ops = ADD_COUNTER(child_profile, "WriteIoOps", TUnit::UNIT);
+  counters_.bytes_written = ADD_COUNTER(child_profile, "WriteIoBytes", TUnit::BYTES);
+  counters_.peak_unpinned_bytes =
+      child_profile->AddHighWaterMarkCounter("PeakUnpinnedBytes", TUnit::BYTES);
+}
+
+BufferPool::Page* BufferPool::Client::CreatePinnedPage(BufferHandle&& buffer) {
+  Page* page = new Page(this, buffer.len());
+  page->buffer = move(buffer);
+  page->pin_count = 1;
+
+  boost::lock_guard<boost::mutex> lock(lock_);
+  // The buffer is transferred to the page so will be accounted for in
+  // pinned_pages_.bytes() instead of buffers_allocated_bytes_.
+  buffers_allocated_bytes_ -= page->len;
+  pinned_pages_.enqueue(page);
+  ++num_pages_;
+  DCHECK_CONSISTENCY();
+  return page;
+}
+
+
+void BufferPool::Client::DestroyPageInternal(
+    PageHandle* handle, BufferHandle* out_buffer) {
+  DCHECK(handle->is_pinned() || out_buffer == NULL);
+  Page* page = handle->page_;
+  // Remove the page from the list that it is currently present in (if any).
+  {
+    unique_lock<boost::mutex> cl(lock_);
+    // First try to remove from the pinned or dirty unpinned lists.
+    if (!pinned_pages_.remove(page) && !dirty_unpinned_pages_.remove(page)) {
+      // The page either has a write in flight, is clean, or is evicted.
+      // Let the write complete, if in flight.
+      //WaitForWrite(&cl, page);
+      // If clean, remove it from the clean pages list. If evicted, this is a no-op.
+      pool_->allocator_->RemoveCleanPage(cl, out_buffer != nullptr, page);
+    }
+    DCHECK(!page->in_queue());
+    --num_pages_;
+  }
+
+  //if (page->write_handle != NULL) {
+    // Discard any on-disk data.
+    //file_group_->DestroyWriteHandle(move(page->write_handle));
+  //}
+  //
+  if (out_buffer != NULL) {
+    DCHECK(page->buffer.is_open());
+    *out_buffer = std::move(page->buffer);
+    buffers_allocated_bytes_ += out_buffer->len();
+  } else if (page->buffer.is_open()) {
+    pool_->allocator_->Free(move(page->buffer));
+  }
+  delete page;
+  handle->Reset();
+}
+
+
+void BufferPool::Client::MoveToDirtyUnpinned(Page* page) {
+  // Only valid to unpin pages if spilling is enabled.
+  // DCHECK(spilling_enabled());
+  DCHECK_EQ(0, page->pin_count);
+
+  unique_lock<boost::mutex> lock(lock_);
+  DCHECK_CONSISTENCY();
+  DCHECK(pinned_pages_.contains(page));
+  pinned_pages_.remove(page);
+  dirty_unpinned_pages_.enqueue(page);
+
+  // Check if we should initiate writes for this (or another) dirty page.
+  //WriteDirtyPagesAsync();
+}
+
+Status BufferPool::Client::StartMoveToPinned(ClientHandle* client, Page* page) {
+  unique_lock<boost::mutex> cl(lock_);
+  DCHECK_CONSISTENCY();
+  // Propagate any write errors that occurred for this client.
+  //RETURN_IF_ERROR(write_status_i;
+
+  if (dirty_unpinned_pages_.remove(page)) {
+    // No writes were initiated for the page - just move it back to the pinned state.
+    pinned_pages_.enqueue(page);
+    return Status::OK;
+  }
+
+  return Status("start move to pinned error, page is not in dirty.");
+/*
+  if (in_flight_write_pages_.contains(page)) {
+    // A write is in flight. If so, wait for it to complete - then we only have to
+    // handle the pinned and evicted cases.
+    WaitForWrite(&cl, page);
+    RETURN_IF_ERROR(write_status_); // The write may have set 'write_status_'.
+  }
+
+  // At this point we need to either reclaim a clean page or allocate a new buffer.
+  // We may need to clean some pages to do so.
+  RETURN_IF_ERROR(CleanPages(&cl, page->len));
+  if (pool_->allocator_->RemoveCleanPage(cl, true, page)) {
+    // The clean page still has an associated buffer. Restore the data, and move the page
+    // back to the pinned state.
+    pinned_pages_.enqueue(page);
+    DCHECK(page->buffer.is_open());
+    DCHECK(page->write_handle != NULL);
+    // Don't need on-disk data.
+    cl.unlock(); // Don't block progress for other threads operating on other pages.
+    return file_group_->RestoreData(move(page->write_handle), page->buffer.mem_range());
+  }
+  // If the page wasn't in the clean pages list, it must have been evicted.
+  return StartMoveEvictedToPinned(&cl, client, page);
+*/
+}
+/*
+Status BufferPool::Client::StartMoveEvictedToPinned(
+    unique_lock<boost::mutex>* client_lock, ClientHandle* client, Page* page) {
+  DCHECK(!page->buffer.is_open());
+
+  // Safe to modify the page's buffer handle without holding the page lock because no
+  // concurrent operations can modify evicted pages.
+  BufferHandle buffer;
+  RETURN_IF_ERROR(pool_->allocator_->Allocate(client, page->len, &page->buffer));
+  COUNTER_ADD(counters().bytes_read, page->len);
+  COUNTER_ADD(counters().read_io_ops, 1);
+  RETURN_IF_ERROR(
+      file_group_->ReadAsync(page->write_handle.get(), page->buffer.mem_range()));
+  pinned_pages_.enqueue(page);
+  page->pin_in_flight = true;
+  DCHECK_CONSISTENCY();
+  return Status::OK;
+}
+
+void BufferPool::Client::UndoMoveEvictedToPinned(Page* page) {
+  // We need to get the page back to the evicted state where:
+  // * There is no in-flight read.
+  // * The page's data is on disk referenced by 'write_handle'
+  // * The page has no attached buffer.
+  DCHECK(page->pin_in_flight);
+  page->write_handle->CancelRead();
+  page->pin_in_flight = false;
+
+  unique_lock<boost::mutex> lock(lock_);
+  DCHECK_CONSISTENCY();
+  DCHECK(pinned_pages_.contains(page));
+  pinned_pages_.remove(page);
+  // Discard the buffer - the pin was in flight so there was no way that a valid
+  // reference to the buffer's contents was returned since the pin was still in flight.
+  pool_->allocator_->Free(move(page->buffer));
+}
+*/
+/*
+Status BufferPool::Client::FinishMoveEvictedToPinned(Page* page) {
+  DCHECK(page->pin_in_flight);
+  SCOPED_TIMER(counters().read_wait_time);
+  // Don't hold any locks while reading back the data. It is safe to modify the page's
+  // buffer handle without holding any locks because no concurrent operations can modify
+  // evicted pages.
+  RETURN_IF_ERROR(
+      file_group_->WaitForAsyncRead(page->write_handle.get(), page->buffer.mem_range()));
+  file_group_->DestroyWriteHandle(move(page->write_handle));
+  page->pin_in_flight = false;
+  return Status::OK;
+}
+*/
+Status BufferPool::Client::PrepareToAllocateBuffer(int64_t len) {
+  unique_lock<boost::mutex> lock(lock_);
+  // Clean enough pages to allow allocation to proceed without violating our eviction
+  // policy. This can fail, so only update the accounting once success is ensured.
+  //RETURN_IF_ERROR(CleanPages(&lock, len));
+  reservation_.AllocateFrom(len);
+  buffers_allocated_bytes_ += len;
+  DCHECK_CONSISTENCY();
+  return Status::OK;
+}
+
+Status BufferPool::Client::DecreaseReservationTo(int64_t target_bytes) {
+  unique_lock<boost::mutex> lock(lock_);
+  int64_t current_reservation = reservation_.GetReservation();
+  DCHECK_GE(current_reservation, target_bytes);
+  int64_t amount_to_free =
+      min(reservation_.GetUnusedReservation(), current_reservation - target_bytes);
+  if (amount_to_free == 0) return Status::OK;
+  // Clean enough pages to allow us to safely release reservation.
+  //RETURN_IF_ERROR(CleanPages(&lock, amount_to_free));
+  reservation_.DecreaseReservation(amount_to_free);
+  return Status::OK;
+}
+
+Status BufferPool::Client::CleanPages(unique_lock<boost::mutex>* client_lock, int64_t len) {
+  DCheckHoldsLock(*client_lock);
+  DCHECK_CONSISTENCY();
+  /*
+  // Work out what we need to get bytes of dirty unpinned + in flight pages down to
+  // in order to satisfy the eviction policy.
+  int64_t target_dirty_bytes = reservation_.GetReservation() - buffers_allocated_bytes_
+      - pinned_pages_.bytes() - len;
+  // Start enough writes to ensure that the loop condition below will eventually become
+  // false (or a write error will be encountered).
+  int64_t min_bytes_to_write =
+      max(0, dirty_unpinned_pages_.bytes() - target_dirty_bytes);
+  //WriteDirtyPagesAsync(min_bytes_to_write);
+
+  // One of the writes we initiated, or an earlier in-flight write may have hit an error.
+  RETURN_IF_ERROR(write_status_);
+
+  // Wait until enough writes have finished so that we can make the allocation without
+  // violating the eviction policy. I.e. so that other clients can immediately get the
+  // memory they're entitled to without waiting for this client's write to complete.
+  DCHECK_GE(in_flight_write_pages_.bytes(), min_bytes_to_write);
+  while (dirty_unpinned_pages_.bytes() + in_flight_write_pages_.bytes()
+      > target_dirty_bytes) {
+    SCOPED_TIMER(counters().write_wait_time);
+    write_complete_cv_.Wait(*client_lock);
+    RETURN_IF_ERROR(write_status_); // Check if error occurred while waiting.
+  }
+*/
+  return Status::OK;
+}
+/*
+void BufferPool::Client::WriteDirtyPagesAsync(int64_t min_bytes_to_write) {
+  DCHECK_GE(min_bytes_to_write, 0);
+  DCHECK_LE(min_bytes_to_write, dirty_unpinned_pages_.bytes());
+ // if (file_group_ == NULL) {
+    // Spilling disabled - there should be no unpinned pages to write.
+    DCHECK_EQ(0, min_bytes_to_write);
+    DCHECK_EQ(0, dirty_unpinned_pages_.bytes());
+    return;
+////  }
+
+  // No point in starting writes if an error occurred because future operations for the
+  // client will fail regardless.
+  if (!write_status_.ok()) return;
+
+  // Compute the ideal amount of writes to start. We use a simple heuristic based on the
+  // total number of writes. The FileGroup's allocation should spread the writes across
+  // disks somewhat, but doesn't guarantee we're fully using all available disks. In
+  // future we could track the # of writes per-disk.
+  const int64_t target_writes = FLAGS_concurrent_scratch_ios_per_device
+      * file_group_->tmp_file_mgr()->NumActiveTmpDevices();
+
+  int64_t bytes_written = 0;
+  while (!dirty_unpinned_pages_.empty()
+      && (bytes_written < min_bytes_to_write
+             || in_flight_write_pages_.size() < target_writes)) {
+    Page* page = dirty_unpinned_pages_.tail(); // LIFO.
+    DCHECK(page != NULL) << "Should have been enough dirty unpinned pages";
+    {
+      lock_guard<SpinLock> pl(page->buffer_lock);
+      DCHECK(file_group_ != NULL);
+      DCHECK(page->buffer.is_open());
+      COUNTER_ADD(counters().bytes_written, page->len);
+      COUNTER_ADD(counters().write_io_ops, 1);
+      Status status = file_group_->Write(page->buffer.mem_range(),
+          [this, page](const Status& write_status) {
+            WriteCompleteCallback(page, write_status);
+          },
+          &page->write_handle);
+      // Exit early on error: there is no point in starting more writes because future
+      /// operations for this client will fail regardless.
+      if (!status.ok()) {
+        write_status_.MergeStatus(status);
+        return;
+      }
+    }
+    // Now that the write is in flight, update all the state
+    Page* tmp = dirty_unpinned_pages_.pop_back();
+    DCHECK_EQ(tmp, page);
+    in_flight_write_pages_.enqueue(page);
+    bytes_written += page->len;
+  } 
+}
+
+void BufferPool::Client::WriteCompleteCallback(Page* page, const Status& write_status) {
+#ifndef NDEBUG
+  if (debug_write_delay_ms_ > 0) SleepForMs(debug_write_delay_ms_);
+#endif
+  {
+    unique_lock<boost::mutex> cl(lock_);
+    DCHECK(in_flight_write_pages_.contains(page));
+    // The status should always be propagated.
+    // TODO: if we add cancellation support to TmpFileMgr, consider cancellation path.
+    if (!write_status.ok()) write_status_.MergeStatus(write_status);
+    in_flight_write_pages_.remove(page);
+    // Move to clean pages list even if an error was encountered - the buffer can be
+    // repurposed by other clients and 'write_status_' must be checked by this client
+    // before reading back the bad data.
+    pool_->allocator_->AddCleanPage(cl, page);
+    WriteDirtyPagesAsync(); // Start another asynchronous write if needed.
+
+    // Notify before releasing lock to avoid race with Page and Client destruction.
+    page->write_complete_cv_.NotifyAll();
+    write_complete_cv_.NotifyAll();
+  }
+}
+
+void BufferPool::Client::WaitForWrite(unique_lock<boost::mutex>* client_lock, Page* page) {
+  DCheckHoldsLock(*client_lock);
+  while (in_flight_write_pages_.contains(page)) {
+    SCOPED_TIMER(counters().write_wait_time);
+    page->write_complete_cv_.Wait(*client_lock);
+  }
+}
+
+void BufferPool::Client::WaitForAllWrites() {
+  unique_lock<boost::mutex> cl(lock_);
+  while (in_flight_write_pages_.size() > 0) {
+    write_complete_cv_.Wait(cl);
+  }
+}
+*/
+string BufferPool::Client::DebugString() {
+  lock_guard<boost::mutex> lock(lock_);
+  stringstream ss;
+  ss << " " << this << " name: " << name_ << " write_status: "
+     << write_status_.get_error_msg() << " buffers allocated " << buffers_allocated_bytes_
+     << " num_pages: " << num_pages_ << " pinned_bytes: " << pinned_pages_.bytes()
+     << " dirty_unpinned_bytes: " << dirty_unpinned_pages_.bytes() << " in_flight_write_bytes: "
+     << in_flight_write_pages_.bytes() << " reservation: " << reservation_.DebugString();
+  ss << "\n  " << pinned_pages_.size() << " pinned pages: ";
+  pinned_pages_.iterate(bind<bool>(Page::DebugStringCallback, &ss, _1));
+  ss << "\n  " << dirty_unpinned_pages_.size() << " dirty unpinned pages: ";
+  dirty_unpinned_pages_.iterate(bind<bool>(Page::DebugStringCallback, &ss, _1));
+  ss << "\n  " << in_flight_write_pages_.size() << " in flight write pages: ";
+  in_flight_write_pages_.iterate(bind<bool>(Page::DebugStringCallback, &ss, _1));
+  return ss.str();
+}
+
+string BufferPool::ClientHandle::DebugString() const {
+  std::stringstream ss;
+  if (is_registered()) {
+    ss << " " << this << " internal state: {"
+               << impl_->DebugString() << "}";
+    return ss.str();
+  } else {
+    ss << " " << this << " UNREGISTERED"; 
+    return ss.str();
+  }
+}
+/*
+string BufferPool::PageHandle::DebugString() const {
+  if (is_open()) {
+    lock_guard<SpinLock> pl(page_->buffer_lock);
+    return Substitute(" $0 client: $1/$2 page: {$3}", this,
+        client_, client_->impl_, page_->DebugString());
+  } else {
+    return Substitute(" $0 CLOSED", this);
+  }
+}
+*/
+string BufferPool::Page::DebugString() {
+  std::stringstream ss;
+  ss << " " << this << " len: " 
+     << len << " pin_count:" << pin_count << " buf:" << buffer.DebugString();
+  return ss.str();
+}
+
+bool BufferPool::Page::DebugStringCallback(stringstream* ss, BufferPool::Page* page) {
+  lock_guard<SpinLock> pl(page->buffer_lock);
+  (*ss) << page->DebugString() << "\n";
+  return true;
+}
+
+string BufferPool::BufferHandle::DebugString() const {
+  std::stringstream ss;  
+  if (is_open()) {
+    ss << " " << this << " client: " << client_ 
+       << "/" << client_->impl_ << " data: " << data_ << " len: " << len_;
+  } else {
+    ss << " " << this << " CLOSED";
+  }
+  return ss.str();
+}
+
+string BufferPool::DebugString() {
+  stringstream ss;
+  ss << " " << this << " min_buffer_len: " << min_buffer_len_ << "\n"
+     << allocator_->DebugString();
+  return ss.str();
+}
+}
diff --git a/be/src/runtime/bufferpool/buffer_pool.h b/be/src/runtime/bufferpool/buffer_pool.h
new file mode 100644
index 0000000000..0c7654919e
--- /dev/null
+++ b/be/src/runtime/bufferpool/buffer_pool.h
@@ -0,0 +1,554 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_RUNTIME_BUFFER_POOL_H
+#define BDG_PALO_BE_RUNTIME_BUFFER_POOL_H
+
+#include <stdint.h>
+#include <string>
+#include <memory>
+#include <boost/scoped_ptr.hpp>
+#include <boost/thread/mutex.hpp>
+
+#include "common/atomic.h"
+#include "common/compiler_util.h"
+#include "common/object_pool.h"
+#include "common/status.h"
+#include "gutil/macros.h"
+#include "gutil/dynamic_annotations.h"
+//#include "runtime/tmp_file_mgr.h"
+#include "util/aligned_new.h"
+#include "util/internal_queue.h"
+#include "util/mem_range.h"
+#include "util/spinlock.h"
+
+namespace palo {
+
+class ReservationTracker;
+class RuntimeProfile;
+class SystemAllocator;
+class MemTracker;
+
+/// A buffer pool that manages memory buffers for all queries in an Impala daemon.
+/// The buffer pool enforces buffer reservations, limits, and implements policies
+/// for moving spilled memory from in-memory buffers to disk. It also enables reuse of
+/// buffers between queries, to avoid frequent allocations.
+///
+/// The buffer pool can be used for allocating any large buffers (above a configurable
+/// minimum length), whether or not the buffers will be spilled. Smaller allocations
+/// are not serviced directly by the buffer pool: clients of the buffer pool must
+/// subdivide buffers if they wish to use smaller allocations.
+///
+/// All buffer pool operations are in the context of a registered buffer pool client.
+/// A buffer pool client should be created for every allocator of buffers at the level
+/// of granularity required for reporting and enforcement of reservations, e.g. an
+/// operator. The client tracks buffer reservations via its ReservationTracker and also
+/// includes info that is helpful for debugging (e.g. the operator that is associated
+/// with the buffer). Unless otherwise noted, it is not safe to invoke concurrent buffer
+/// pool operations for the same client.
+///
+/// Pages, Buffers and Pinning
+/// ==========================
+/// * A page is a logical block of memory that can reside in memory or on disk.
+/// * A buffer is a physical block of memory that can hold a page in memory.
+/// * A page handle is used by buffer pool clients to identify and access a page and
+///   the corresponding buffer. Clients do not interact with pages directly.
+/// * A buffer handle is used by buffer pool clients to identify and access a buffer.
+/// * A page is pinned if it has pin count > 0. A pinned page stays mapped to the same
+///   buffer.
+/// * An unpinned page can be written out to disk by the buffer pool so that the buffer
+///   can be used for another purpose.
+///
+/// Buffer/Page Sizes
+/// =================
+/// The buffer pool has a minimum buffer size, which must be a power-of-two. Page and
+/// buffer sizes must be an exact power-of-two multiple of the minimum buffer size.
+///
+/// Reservations
+/// ============
+/// Before allocating buffers or pinning pages, a client must reserve memory through its
+/// ReservationTracker. Reservation of n bytes give a client the right to allocate
+/// buffers or pin pages summing up to n bytes. Reservations are both necessary and
+/// sufficient for a client to allocate buffers or pin pages: the operations succeed
+/// unless a "system error" such as a disk write error is encountered that prevents
+/// unpinned pages from being written to disk.
+///
+/// More memory may be reserved than is used, e.g. if a client is not using its full
+/// reservation. In such cases, the buffer pool can use the free buffers in any way,
+/// e.g. for keeping unpinned pages in memory, so long as it is able to fulfill the
+/// reservations when needed, e.g. by flushing unpinned pages to disk.
+///
+/// Page/Buffer Handles
+/// ===================
+/// The buffer pool exposes PageHandles and BufferHandles, which are owned by clients of
+/// the buffer pool, and act as a proxy for the internal data structure representing the
+/// page or buffer in the buffer pool. Handles are "open" if they are associated with a
+/// page or buffer. An open PageHandle is obtained by creating a page. PageHandles are
+/// closed by calling BufferPool::DestroyPage(). An open BufferHandle is obtained by
+/// allocating a buffer or extracting a BufferHandle from a PageHandle. The buffer of a
+/// pinned page can also be accessed through the PageHandle. The handle destructors check
+/// for resource leaks, e.g. an open handle that would result in a buffer leak.
+///
+/// Pin Counting of Page Handles:
+/// ----------------------------------
+/// Page handles are scoped to a client. The invariants are as follows:
+/// * A page can only be accessed through an open handle.
+/// * A page is destroyed once the handle is destroyed via DestroyPage().
+/// * A page's buffer can only be accessed through a pinned handle.
+/// * Pin() can be called on an open handle, incrementing the handle's pin count.
+/// * Unpin() can be called on a pinned handle, but not an unpinned handle.
+/// * Pin() always increases usage of reservations, and Unpin() always decreases usage,
+///   i.e. the handle consumes <pin count> * <page length> bytes of reservation.
+///
+/// Example Usage: Buffers
+/// ==================================
+/// The simplest use case is to allocate a memory buffer.
+/// * The new buffer is created with AllocateBuffer().
+/// * The client reads and writes to the buffer as it sees fit.
+/// * If the client is done with the buffer's contents it can call FreeBuffer() to
+///   destroy the handle and free the buffer, or use TransferBuffer() to transfer
+///   the buffer to a different client.
+///
+/// Example Usage: Spillable Pages
+/// ==============================
+/// * In order to spill pages to disk, the Client must be registered with a FileGroup,
+///   which is used to allocate scratch space on disk.
+/// * A spilling operator creates a new page with CreatePage().
+/// * The client reads and writes to the page's buffer as it sees fit.
+/// * If the operator encounters memory pressure, it can decrease reservation usage by
+///   calling Unpin() on the page. The page may then be written to disk and its buffer
+///   repurposed internally by BufferPool.
+/// * Once the operator needs the page's contents again and has sufficient unused
+///   reservation, it can call Pin(), which brings the page's contents back into memory,
+///   perhaps in a different buffer. Therefore the operator must fix up any pointers into
+///   the previous buffer. Pin() executes asynchronously - the caller only blocks waiting
+///   for read I/O if it calls GetBuffer() or ExtractBuffer() while the read is in
+///   flight.
+/// * If the operator is done with the page, it can call DestroyPage() to destroy the
+///   handle and release resources, or call ExtractBuffer() to extract the buffer.
+///
+/// Synchronization
+/// ===============
+/// The data structures in the buffer pool itself are thread-safe. Client-owned data
+/// structures - Client, PageHandle and BufferHandle - are not protected from concurrent
+/// accesses. Clients must ensure that they do not invoke concurrent operations with the
+/// same Client, PageHandle or BufferHandle.
+class BufferPool : public CacheLineAligned {
+ public:
+  class BufferAllocator;
+  class BufferHandle;
+  class ClientHandle;
+  class PageHandle;
+  class SubReservation;
+  /// Constructs a new buffer pool.
+  /// 'min_buffer_len': the minimum buffer length for the pool. Must be a power of two.
+  /// 'buffer_bytes_limit': the maximum physical memory in bytes that can be used by the
+  ///     buffer pool. If 'buffer_bytes_limit' is not a multiple of 'min_buffer_len', the
+  ///     remainder will not be usable.
+  /// 'clean_page_bytes_limit': the maximum bytes of clean pages that will be retained by the
+  ///     buffer pool.
+  BufferPool(int64_t min_buffer_len, int64_t buffer_bytes_limit,
+      int64_t clean_page_bytes_limit);
+  ~BufferPool();
+
+  /// Register a client. Returns an error status and does not register the client if the
+  /// arguments are invalid. 'name' is an arbitrary name used to identify the client in
+  /// any errors messages or logging. If 'file_group' is non-NULL, it is used to allocate
+  /// scratch space to write unpinned pages to disk. If it is NULL, unpinning of pages is
+  /// not allowed for this client. Counters for this client are added to the (non-NULL)
+  /// 'profile'. 'client' is the client to register. 'client' must not already be
+  /// registered.
+  ///
+  /// The client's reservation is created as a child of 'parent_reservation' with limit
+  /// 'reservation_limit' and associated with MemTracker 'mem_tracker'. The initial
+  /// reservation is 0 bytes.
+  Status RegisterClient(const std::string& name, //TmpFileMgr::FileGroup* file_group,
+      ReservationTracker* parent_reservation, MemTracker* mem_tracker,
+      int64_t reservation_limit, RuntimeProfile* profile,
+      ClientHandle* client) WARN_UNUSED_RESULT;
+
+  /// Deregister 'client' if it is registered. All pages must be destroyed and buffers
+  /// must be freed for the client before calling this. Releases any reservation that
+  /// belongs to the client. Idempotent.
+  void DeregisterClient(ClientHandle* client);
+
+  /// Create a new page of 'len' bytes with pin count 1. 'len' must be a page length
+  /// supported by BufferPool (see BufferPool class comment). The client must have
+  /// sufficient unused reservation to pin the new page (otherwise it will DCHECK).
+  /// CreatePage() only fails when a system error prevents the buffer pool from fulfilling
+  /// the reservation.
+  /// On success, the handle is mapped to the new page and 'buffer', if non-NULL, is set
+  /// to the page's buffer.
+  Status CreatePage(ClientHandle* client, int64_t len, PageHandle* handle,
+      const BufferHandle** buffer = nullptr) WARN_UNUSED_RESULT;
+
+  /// Increment the pin count of 'handle'. After Pin() the underlying page will
+  /// be mapped to a buffer, which will be accessible through 'handle'. If the data
+  /// was evicted from memory, it will be read back into memory asynchronously.
+  /// Attempting to access the buffer with ExtractBuffer() or handle.GetBuffer() will
+  /// block until the data is in memory. The caller is responsible for ensuring it has
+  /// enough unused reservation before calling Pin() (otherwise it will DCHECK). Pin()
+  /// only fails when a system error prevents the buffer pool from fulfilling the
+  /// reservation or if an I/O error is encountered reading back data from disk.
+  /// 'handle' must be open.
+  Status Pin(ClientHandle* client, PageHandle* handle) WARN_UNUSED_RESULT;
+
+  /// Decrement the pin count of 'handle'. Decrease client's reservation usage. If the
+  /// handle's pin count becomes zero, it is no longer valid for the underlying page's
+  /// buffer to be accessed via 'handle'. If the page's total pin count across all
+  /// handles that reference it goes to zero, the page's data may be written to disk and
+  /// the buffer reclaimed. 'handle' must be open and have a pin count > 0.
+  ///
+  /// It is an error to reduce the pin count to 0 if 'client' does not have an associated
+  /// FileGroup.
+  void Unpin(ClientHandle* client, PageHandle* handle);
+
+  /// Destroy the page referenced by 'handle' (if 'handle' is open). Any buffers or disk
+  /// storage backing the page are freed. Idempotent. If the page is pinned, the
+  /// reservation usage is decreased accordingly.
+  void DestroyPage(ClientHandle* client, PageHandle* handle);
+
+  /// Extracts buffer from a pinned page. After this returns, the page referenced by
+  /// 'page_handle' will be destroyed and 'buffer_handle' will reference the buffer from
+  /// 'page_handle'. This may decrease reservation usage of 'client' if the page was
+  /// pinned multiple times via 'page_handle'. May return an error if 'page_handle' was
+  /// unpinned earlier with no subsequent GetBuffer() call and a read error is
+  /// encountered while bringing the page back into memory.
+  Status ExtractBuffer(
+      ClientHandle* client, PageHandle* page_handle, BufferHandle* buffer_handle) WARN_UNUSED_RESULT;
+
+  /// Allocates a new buffer of 'len' bytes. Uses reservation from 'client'. The caller
+  /// is responsible for ensuring it has enough unused reservation before calling
+  /// AllocateBuffer() (otherwise it will DCHECK). AllocateBuffer() only fails when
+  /// a system error prevents the buffer pool from fulfilling the reservation.
+  Status AllocateBuffer(
+      ClientHandle* client, int64_t len, BufferHandle* handle) WARN_UNUSED_RESULT;
+
+  /// If 'handle' is open, close 'handle', free the buffer and decrease the reservation
+  /// usage from 'client'. Idempotent. Safe to call concurrently with any other
+  /// operations for 'client'.
+  void FreeBuffer(ClientHandle* client, BufferHandle* handle);
+
+  /// Transfer ownership of buffer from 'src_client' to 'dst_client' and move the
+  /// handle from 'src' to 'dst'. Increases reservation usage in 'dst_client' and
+  /// decreases reservation usage in 'src_client'. 'src' must be open and 'dst' must be
+  /// closed before calling. 'src'/'dst' and 'src_client'/'dst_client' must be different.
+  /// After a successful call, 'src' is closed and 'dst' is open. Safe to call
+  /// concurrently with any other operations for 'src_client'.
+  Status TransferBuffer(ClientHandle* src_client, BufferHandle* src,
+      ClientHandle* dst_client, BufferHandle* dst) WARN_UNUSED_RESULT;
+
+  /// Try to release at least 'bytes_to_free' bytes of memory to the system allocator.
+  /// TODO: once IMPALA-4834 is done and all large allocations are served from the buffer
+  /// pool, this may not be necessary.
+  void ReleaseMemory(int64_t bytes_to_free);
+
+  /// Called periodically by a maintenance thread to release unused memory back to the
+  /// system allocator.
+  void Maintenance();
+
+  /// Print a debug string with the state of the buffer pool.
+  std::string DebugString();
+
+  int64_t min_buffer_len() const { return min_buffer_len_; }
+  int64_t GetSystemBytesLimit() const;
+  int64_t GetSystemBytesAllocated() const;
+
+  /// Return the limit on bytes of clean pages in the pool.
+  int64_t GetCleanPageBytesLimit() const;
+
+  /// Return the total number of clean pages in the pool.
+  int64_t GetNumCleanPages() const;
+
+  /// Return the total bytes of clean pages in the pool.
+  int64_t GetCleanPageBytes() const;
+
+  /// Return the total number of free buffers in the pool.
+  int64_t GetNumFreeBuffers() const;
+
+  /// Return the total bytes of free buffers in the pool.
+  int64_t GetFreeBufferBytes() const;
+
+  /// Generous upper bounds on page and buffer size and the number of different
+  /// power-of-two buffer sizes.
+  static constexpr int LOG_MAX_BUFFER_BYTES = 48;
+  static constexpr int64_t MAX_BUFFER_BYTES = 1L << LOG_MAX_BUFFER_BYTES;
+
+ protected:
+  friend class BufferPoolTest;
+  /// Test helper: get a reference to the allocator.
+  BufferAllocator* allocator() { return allocator_.get(); }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(BufferPool);
+  class Client;
+  class FreeBufferArena;
+  class PageList;
+  struct Page;
+
+  /// Allocator for allocating and freeing all buffer memory and managing lists of free
+  /// buffers and clean pages.
+  boost::scoped_ptr<BufferAllocator> allocator_;
+
+  /// The minimum length of a buffer in bytes. All buffers and pages are a power-of-two
+  /// multiple of this length. This is always a power of two.
+  const int64_t min_buffer_len_;
+};
+
+/// External representation of a client of the BufferPool. Clients are used for
+/// reservation accounting, and will be used in the future for tracking per-client
+/// buffer pool counters. This class is the external handle for a client so
+/// each Client instance is owned by the BufferPool's client, rather than the BufferPool.
+/// Each Client should only be used by a single thread at a time: concurrently calling
+/// Client methods or BufferPool methods with the Client as an argument is not supported.
+class BufferPool::ClientHandle {
+ public:
+  ClientHandle() : impl_(NULL) {}
+  /// Client must be deregistered.
+  ~ClientHandle() { DCHECK(!is_registered()); }
+
+  /// Request to increase reservation for this client by 'bytes' by calling
+  /// ReservationTracker::IncreaseReservation(). Returns true if the reservation was
+  /// successfully increased.
+  bool IncreaseReservation(int64_t bytes) WARN_UNUSED_RESULT;
+
+  /// Tries to ensure that 'bytes' of unused reservation is available for this client
+  /// to use by calling ReservationTracker::IncreaseReservationToFit(). Returns true
+  /// if successful, after which 'bytes' can be used.
+  bool IncreaseReservationToFit(int64_t bytes) WARN_UNUSED_RESULT;
+
+  /// Try to decrease this client's reservation down to a minimum of 'target_bytes' by
+  /// releasing unused reservation to ancestor ReservationTrackers, all the way up to
+  /// the root of the ReservationTracker tree. May block waiting for unpinned pages to
+  /// be flushed. This client's reservation must be at least 'target_bytes' before
+  /// calling this method. May fail if decreasing the reservation requires flushing
+  /// unpinned pages to disk and a write to disk fails.
+  Status DecreaseReservationTo(int64_t target_bytes) WARN_UNUSED_RESULT;
+
+  /// Move some of this client's reservation to the SubReservation. 'bytes' of unused
+  /// reservation must be available in this tracker.
+  void SaveReservation(SubReservation* dst, int64_t bytes);
+
+  /// Move some of src's reservation to this client. 'bytes' of unused reservation must be
+  /// available in 'src'.
+  void RestoreReservation(SubReservation* src, int64_t bytes);
+
+  /// Accessors for this client's reservation corresponding to the identically-named
+  /// methods in ReservationTracker.
+  int64_t GetReservation() const;
+  int64_t GetUsedReservation() const;
+  int64_t GetUnusedReservation() const;
+
+  /// Try to transfer 'bytes' of reservation from 'src' to this client using
+  /// ReservationTracker::TransferReservationTo().
+  bool TransferReservationFrom(ReservationTracker* src, int64_t bytes);
+
+  /// Transfer 'bytes' of reservation from this client to 'dst' using
+  /// ReservationTracker::TransferReservationTo().
+  bool TransferReservationTo(ReservationTracker* dst, int64_t bytes);
+
+  /// Call SetDebugDenyIncreaseReservation() on this client's ReservationTracker.
+  void SetDebugDenyIncreaseReservation(double probability);
+
+  bool is_registered() const { return impl_ != NULL; }
+
+  /// Return true if there are any unpinned pages for this client.
+  bool has_unpinned_pages() const;
+
+  std::string DebugString() const;
+
+ private:
+  friend class BufferPool;
+  friend class BufferPoolTest;
+  friend class SubReservation;
+  DISALLOW_COPY_AND_ASSIGN(ClientHandle);
+
+  /// Internal state for the client. NULL means the client isn't registered.
+  /// Owned by BufferPool.
+  Client* impl_;
+};
+
+/// Helper class that allows dividing up a client's reservation into separate buckets.
+class BufferPool::SubReservation {
+ public:
+  SubReservation(ClientHandle* client);
+  ~SubReservation();
+
+  /// Returns the amount of reservation stored in this sub-reservation.
+  int64_t GetReservation() const;
+
+  /// Releases the sub-reservation to the client's tracker. Must be called before
+  /// destruction.
+  void Close();
+
+  bool is_closed() const { return tracker_ == nullptr; }
+
+ private:
+  friend class BufferPool::ClientHandle;
+  DISALLOW_COPY_AND_ASSIGN(SubReservation);
+
+  /// Child of the client's tracker used to track the sub-reservation. Usage is not
+  /// tracked against this tracker - instead the reservation is always transferred back
+  /// to the client's tracker before use.
+  boost::scoped_ptr<ReservationTracker> tracker_;
+};
+
+/// A handle to a buffer allocated from the buffer pool. Each BufferHandle should only
+/// be used by a single thread at a time: concurrently calling BufferHandle methods or
+/// BufferPool methods with the BufferHandle as an argument is not supported.
+class BufferPool::BufferHandle {
+ public:
+  BufferHandle() { Reset(); }
+  ~BufferHandle() { DCHECK(!is_open()); }
+
+  /// Allow move construction of handles to support std::move(). Inline to make moving
+  /// efficient.
+  inline BufferHandle(BufferHandle&& src);
+
+  /// Allow move assignment of handles to support STL classes like std::vector.
+  /// Destination must be uninitialized. Inline to make moving efficient.
+  inline BufferHandle& operator=(BufferHandle&& src);
+
+  bool is_open() const { return data_ != NULL; }
+  int64_t len() const {
+    DCHECK(is_open());
+    return len_;
+  }
+  /// Get a pointer to the start of the buffer.
+  uint8_t* data() const {
+    DCHECK(is_open());
+    return data_;
+  }
+
+  MemRange mem_range() const { return MemRange(data(), len()); }
+
+  std::string DebugString() const;
+
+  /// Poison the memory associated with this handle. If ASAN is not enabled, this is a
+  /// no-op.
+  void Poison() { ASAN_POISON_MEMORY_REGION(data(), len()); }
+
+  /// Unpoison the memory associated with this handle. If ASAN is not enabled, this is a
+  /// no-op.
+  void Unpoison() { ASAN_UNPOISON_MEMORY_REGION(data(), len()); }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(BufferHandle);
+  friend class BufferPool;
+  friend class SystemAllocator;
+
+  /// Internal helper to set the handle to an opened state.
+  void Open(uint8_t* data, int64_t len, int home_core);
+
+  /// Internal helper to reset the handle to an unopened state. Inlined to make moving
+  /// efficient.
+  inline void Reset();
+
+  /// The client the buffer handle belongs to, used to validate that the correct client
+  /// is provided in BufferPool method calls. Set to NULL if the buffer is in a free list.
+  const ClientHandle* client_;
+
+  /// Pointer to the start of the buffer. Non-NULL if open, NULL if closed.
+  uint8_t* data_;
+
+  /// Length of the buffer in bytes.
+  int64_t len_;
+
+  /// The CPU core that the buffer was allocated from - used to determine which arena
+  /// it will be added to.
+  int home_core_;
+};
+
+/// The handle for a page used by clients of the BufferPool. Each PageHandle should
+/// only be used by a single thread at a time: concurrently calling PageHandle methods
+/// or BufferPool methods with the PageHandle as an argument is not supported.
+class BufferPool::PageHandle {
+ public:
+  PageHandle();
+  ~PageHandle() { DCHECK(!is_open()); }
+
+  // Allow move construction of page handles, to support std::move().
+  PageHandle(PageHandle&& src);
+
+  // Allow move assignment of page handles, to support STL classes like std::vector.
+  // Destination must be closed.
+  PageHandle& operator=(PageHandle&& src);
+
+  bool is_open() const { return page_ != NULL; }
+  bool is_pinned() const { return pin_count() > 0; }
+  int pin_count() const;
+  int64_t len() const;
+
+  /// Get a reference to the page's buffer handle. Only valid to call if the page is
+  /// pinned. If the page was previously unpinned and the read I/O for the data is still
+  /// in flight, this can block waiting. Returns an error if an error was encountered
+  /// reading the data back, which can only happen if Unpin() was called on the page
+  /// since the last call to GetBuffer(). Only const accessors of the returned handle can
+  /// be used: it is invalid to call FreeBuffer() or TransferBuffer() on it or to
+  /// otherwise modify the handle.
+  Status GetBuffer(const BufferHandle** buffer_handle) const WARN_UNUSED_RESULT;
+
+  std::string DebugString() const;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(PageHandle);
+  friend class BufferPool;
+  friend class BufferPoolTest;
+  friend class Page;
+
+  /// Internal helper to open the handle for the given page.
+  void Open(Page* page, ClientHandle* client);
+
+  /// Internal helper to reset the handle to an unopened state.
+  void Reset();
+
+  /// The internal page structure. NULL if the handle is not open.
+  Page* page_;
+
+  /// The client the page handle belongs to.
+  ClientHandle* client_;
+};
+
+inline BufferPool::BufferHandle::BufferHandle(BufferHandle&& src) {
+  Reset();
+  *this = std::move(src);
+}
+
+inline BufferPool::BufferHandle& BufferPool::BufferHandle::operator=(
+    BufferHandle&& src) {
+  DCHECK(!is_open());
+  // Copy over all members then close src.
+  client_ = src.client_;
+  data_ = src.data_;
+  len_ = src.len_;
+  home_core_ = src.home_core_;
+  src.Reset();
+  return *this;
+}
+
+inline void BufferPool::BufferHandle::Reset() {
+  client_ = NULL;
+  data_ = NULL;
+  len_ = -1;
+  home_core_ = -1;
+}
+}
+
+#endif
diff --git a/be/src/runtime/bufferpool/buffer_pool_counters.h b/be/src/runtime/bufferpool/buffer_pool_counters.h
new file mode 100644
index 0000000000..39e7d81ffa
--- /dev/null
+++ b/be/src/runtime/bufferpool/buffer_pool_counters.h
@@ -0,0 +1,64 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_RUNTIME_BUFFER_POOL_COUNTERS_H
+#define BDG_PALO_BE_RUNTIME_BUFFER_POOL_COUNTERS_H
+
+#include "util/runtime_profile.h"
+
+namespace palo {
+
+/// A set of counters for each buffer pool client.
+struct BufferPoolClientCounters {
+ public:
+  /// Total amount of time spent inside BufferAllocator::AllocateBuffer().
+  RuntimeProfile::Counter* alloc_time;
+
+  /// Number of buffers allocated via BufferAllocator::AllocateBuffer().
+  RuntimeProfile::Counter* cumulative_allocations;
+
+  /// Bytes of buffers allocated via BufferAllocator::AllocateBuffer().
+  RuntimeProfile::Counter* cumulative_bytes_alloced;
+
+  /// Amount of time spent waiting for reads from disk to complete.
+  RuntimeProfile::Counter* read_wait_time;
+
+  /// Total number of read I/O operations issued.
+  RuntimeProfile::Counter* read_io_ops;
+
+  /// Total bytes read from disk.
+  RuntimeProfile::Counter* bytes_read;
+
+  /// Amount of time spent waiting for writes to disk to complete.
+  RuntimeProfile::Counter* write_wait_time;
+
+  /// Total number of write I/O operations issued.
+  RuntimeProfile::Counter* write_io_ops;
+
+  /// Total bytes written to disk.
+  RuntimeProfile::Counter* bytes_written;
+
+  /// The peak total size of unpinned pages.
+  RuntimeProfile::HighWaterMarkCounter* peak_unpinned_bytes;
+};
+
+}
+
+#endif
diff --git a/be/src/runtime/bufferpool/buffer_pool_internal.h b/be/src/runtime/bufferpool/buffer_pool_internal.h
new file mode 100644
index 0000000000..c1231dda33
--- /dev/null
+++ b/be/src/runtime/bufferpool/buffer_pool_internal.h
@@ -0,0 +1,390 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This file includes definitions of classes used internally in the buffer pool.
+//
+/// +========================+
+/// | IMPLEMENTATION NOTES   |
+/// +========================+
+///
+/// Lock Ordering
+/// =============
+/// The lock acquisition order is:
+/// 1. Client::lock_
+/// 2. FreeBufferArena::lock_. If multiple arena locks are acquired, must be acquired in
+///    ascending order.
+/// 3. Page::lock
+///
+/// If a reference to a Page is acquired through a page list, the Page* reference only
+/// remains valid so long as list's lock is held.
+///
+/// Page States
+/// ===========
+/// Each Page object is owned by at most one InternalList at any given point.
+/// Each page is either pinned or unpinned. Unpinned has a number of sub-states, which
+/// is determined by which list in Client/BufferPool contains the page.
+/// * Pinned: Always in this state when 'pin_count' > 0. The page has a buffer and is in
+///     Client::pinned_pages_. 'pin_in_flight' determines which sub-state the page is in:
+///   -> When pin_in_flight=false, the buffer contains the page's data and the client can
+///      read and write to the buffer.
+///   -> When pin_in_flight=true, the page's data is in the process of being read from
+///      scratch disk into the buffer. Clients will block on the read I/O if they attempt
+///      to access the buffer.
+/// * Unpinned - Dirty: When no write to scratch has been started for an unpinned page.
+///     The page is in Client::dirty_unpinned_pages_.
+/// * Unpinned - Write in flight: When the write to scratch has been started but not
+///     completed for a dirty unpinned page. The page is in
+///     Client::write_in_flight_pages_. For accounting purposes this is considered a
+///     dirty page.
+/// * Unpinned - Clean: When the write to scratch has completed but the page was not
+///     evicted. The page is in a clean pages list in a BufferAllocator arena.
+/// * Unpinned - Evicted: After a clean page's buffer has been reclaimed. The page is
+///     not in any list.
+///
+/// Page Eviction Policy
+/// ====================
+/// The page eviction policy is designed so that clients that run only in-memory (i.e.
+/// don't unpin pages) never block on I/O. To achieve this, we must be able to
+/// fulfil reservations by either allocating buffers or evicting clean pages. Assuming
+/// reservations are not overcommitted (they shouldn't be), this global invariant can be
+/// maintained by enforcing a local invariant for every client:
+///
+///   reservation >= BufferHandles returned to client
+///                  + pinned pages + dirty pages (dirty unpinned or write in flight)
+///
+/// The local invariant is maintained by writing pages to disk as the first step of any
+/// operation that allocates a new buffer or reclaims buffers from clean pages. I.e.
+/// "dirty pages" must be decreased before one of the other values on the R.H.S. of the
+/// invariant can be increased. Operations block waiting for enough writes to complete
+/// to satisfy the invariant.
+
+#ifndef BDG_PALO_BE_RUNTIME_BUFFER_POOL_INTERNAL_H
+#define BDG_PALO_BE_RUNTIME_BUFFER_POOL_INTERNAL_H
+
+#include <memory>
+#include <sstream>
+
+#include <boost/thread/mutex.hpp>
+
+#include "runtime/bufferpool/buffer_pool_counters.h"
+#include "runtime/bufferpool/buffer_pool.h"
+#include "runtime/bufferpool/reservation_tracker.h"
+#include "util/condition_variable.h"
+
+// Ensure that DCheckConsistency() function calls get removed in release builds.
+#ifndef NDEBUG
+#define DCHECK_CONSISTENCY() DCheckConsistency()
+#else
+#define DCHECK_CONSISTENCY()
+#endif
+
+namespace palo {
+
+/// The internal representation of a page, which can be pinned or unpinned. See the
+/// class comment for explanation of the different page states.
+class BufferPool::Page : public InternalList<Page>::Node {
+
+public:
+
+  Page(Client* client, int64_t len)
+    : client(client), len(len), pin_count(0), pin_in_flight(false) {}
+  
+  std::string DebugString();
+
+  // Helper for BufferPool::DebugString().
+  static bool DebugStringCallback(std::stringstream* ss, BufferPool::Page* page);
+
+  /// The client that the page belongs to.
+  Client* const client;
+
+  /// The length of the page in bytes.
+  const int64_t len;
+
+  /// The pin count of the page. Only accessed in contexts that are passed the associated
+  /// PageHandle, so it cannot be accessed by multiple threads concurrently.
+  int pin_count;
+
+  /// True if the read I/O to pin the page was started but not completed. Only accessed
+  /// in contexts that are passed the associated PageHandle, so it cannot be accessed
+  /// by multiple threads concurrently.
+  bool pin_in_flight;
+
+  /// Non-null if there is a write in flight, the page is clean, or the page is evicted.
+  //std::unique_ptr<TmpFileMgr::WriteHandle> write_handle;
+
+  /// Condition variable signalled when a write for this page completes. Protected by
+  /// client->lock_.
+  ConditionVariable write_complete_cv_;
+
+  /// This lock must be held when accessing 'buffer' if the page is unpinned and not
+  /// evicted (i.e. it is safe to access 'buffer' if the page is pinned or evicted).
+  SpinLock buffer_lock;
+
+  /// Buffer with the page's contents. Closed only iff page is evicted. Open otherwise.
+  BufferHandle buffer;
+};
+
+/// Wrapper around InternalList that tracks the # of bytes in the list.
+class BufferPool::PageList {
+ public:
+  PageList() : bytes_(0) {}
+  ~PageList() {
+    // Clients always empty out their list before destruction.
+    DCHECK(list_.empty());
+    DCHECK_EQ(0, bytes_);
+  }
+
+  void enqueue(Page* page) {
+    list_.enqueue(page);
+    bytes_ += page->len;
+  }
+
+  bool remove(Page* page) {
+    if (list_.remove(page)) {
+      bytes_ -= page->len;
+      return true;
+    }
+    return false;
+  }
+
+  Page* dequeue() {
+    Page* page = list_.dequeue();
+    if (page != nullptr) {
+      bytes_ -= page->len;
+    }
+    return page;
+  }
+
+  Page* pop_back() {
+    Page* page = list_.pop_back();
+    if (page != nullptr) {
+      bytes_ -= page->len;
+    }
+    return page;
+  }
+
+  void iterate(boost::function<bool(Page*)> fn) { list_.iterate(fn); }
+  bool contains(Page* page) { return list_.contains(page); }
+  Page* tail() { return list_.tail(); }
+  bool empty() const { return list_.empty(); }
+  int size() const { return list_.size(); }
+  int64_t bytes() const { return bytes_; }
+
+  void DCheckConsistency() {
+    DCHECK_GE(bytes_, 0);
+    DCHECK_EQ(list_.empty(), bytes_ == 0);
+  }
+
+ private:
+  InternalList<Page> list_;
+  int64_t bytes_;
+};
+
+/// The internal state for the client.
+class BufferPool::Client {
+ public:
+  Client(BufferPool* pool, //TmpFileMgr::FileGroup* file_group, 
+     const std::string& name,
+      ReservationTracker* parent_reservation, MemTracker* mem_tracker,
+      int64_t reservation_limit, RuntimeProfile* profile);
+
+  ~Client() {
+    DCHECK_EQ(0, num_pages_);
+    DCHECK_EQ(0, buffers_allocated_bytes_);
+  }
+
+  /// Release reservation for this client.
+  void Close() { reservation_.Close(); }
+
+  /// Create a pinned page using 'buffer', which was allocated using AllocateBuffer().
+  /// No client or page locks should be held by the caller.
+  Page* CreatePinnedPage(BufferHandle&& buffer);
+
+  /// Reset 'handle', clean up references to handle->page and release any resources
+  /// associated with handle->page. If the page is pinned, 'out_buffer' can be passed in
+  /// and the page's buffer will be returned.
+  /// Neither the client's lock nor handle->page_->buffer_lock should be held by the
+  /// caller.
+  void DestroyPageInternal(PageHandle* handle, BufferHandle* out_buffer = NULL);
+
+  /// Updates client state to reflect that 'page' is now a dirty unpinned page. May
+  /// initiate writes for this or other dirty unpinned pages.
+  /// Neither the client's lock nor page->buffer_lock should be held by the caller.
+  void MoveToDirtyUnpinned(Page* page);
+
+  /// Move an unpinned page to the pinned state, moving between data structures and
+  /// reading from disk if necessary. Ensures the page has a buffer. If the data is
+  /// already in memory, ensures the data is in the page's buffer. If the data is on
+  /// disk, starts an async read of the data and sets 'pin_in_flight' on the page to
+  /// true. Neither the client's lock nor page->buffer_lock should be held by the caller.
+  Status StartMoveToPinned(ClientHandle* client, Page* page) WARN_UNUSED_RESULT;
+
+  /// Moves a page that has a pin in flight back to the evicted state, undoing
+  /// StartMoveToPinned(). Neither the client's lock nor page->buffer_lock should be held
+  /// by the caller.
+  //void UndoMoveEvictedToPinned(Page* page);
+
+  /// Finish the work of bring the data of an evicted page to memory if
+  /// page->pin_in_flight was set to true by StartMoveToPinned().
+  //Status FinishMoveEvictedToPinned(Page* page) WARN_UNUSED_RESULT;
+
+  /// Must be called once before allocating a buffer of 'len' via the AllocateBuffer()
+  /// API to deduct from the client's reservation and update internal accounting. Cleans
+  /// dirty pages if needed to satisfy the buffer pool's internal invariants. No page or
+  /// client locks should be held by the caller.
+  Status PrepareToAllocateBuffer(int64_t len) WARN_UNUSED_RESULT;
+
+  /// Implementation of ClientHandle::DecreaseReservationTo().
+  Status DecreaseReservationTo(int64_t target_bytes) WARN_UNUSED_RESULT;
+
+  /// Called after a buffer of 'len' is freed via the FreeBuffer() API to update
+  /// internal accounting and release the buffer to the client's reservation. No page or
+  /// client locks should be held by the caller.
+  void FreedBuffer(int64_t len) {
+    boost::lock_guard<boost::mutex> cl(lock_);
+    reservation_.ReleaseTo(len);
+    buffers_allocated_bytes_ -= len;
+    DCHECK_CONSISTENCY();
+  }
+
+  /// Wait for the in-flight write for 'page' to complete.
+  /// 'lock_' must be held by the caller via 'client_lock'. page->buffer_lock should
+  /// not be held.
+  //void WaitForWrite(boost::unique_lock<boost::mutex>* client_lock, Page* page);
+
+  /// Test helper: wait for all in-flight writes to complete.
+  /// 'lock_' must not be held by the caller.
+  //void WaitForAllWrites();
+
+  /// Asserts that 'client_lock' is holding 'lock_'.
+  void DCheckHoldsLock(const boost::unique_lock<boost::mutex>& client_lock) {
+    DCHECK(client_lock.mutex() == &lock_ && client_lock.owns_lock());
+  }
+
+  ReservationTracker* reservation() { return &reservation_; }
+  const BufferPoolClientCounters& counters() const { return counters_; }
+  //bool spilling_enabled() const { return file_group_ != NULL; }
+  void set_debug_write_delay_ms(int val) { debug_write_delay_ms_ = val; }
+  bool has_unpinned_pages() const {
+    // Safe to read without lock since other threads should not be calling BufferPool
+    // functions that create, destroy or unpin pages.
+    return pinned_pages_.size() < num_pages_;
+  }
+
+  std::string DebugString();
+
+ private:
+  // Check consistency of client, DCHECK if inconsistent. 'lock_' must be held.
+  void DCheckConsistency() {
+    DCHECK_GE(buffers_allocated_bytes_, 0);
+    pinned_pages_.DCheckConsistency();
+    dirty_unpinned_pages_.DCheckConsistency();
+    in_flight_write_pages_.DCheckConsistency();
+    DCHECK_LE(pinned_pages_.size() + dirty_unpinned_pages_.size()
+            + in_flight_write_pages_.size(),
+        num_pages_);
+    // Check that we flushed enough pages to disk given our eviction policy.
+    DCHECK_GE(reservation_.GetReservation(), buffers_allocated_bytes_
+            + pinned_pages_.bytes() + dirty_unpinned_pages_.bytes()
+            + in_flight_write_pages_.bytes());
+  }
+
+  /// Must be called once before allocating or reclaiming a buffer of 'len'. Ensures that
+  /// enough dirty pages are flushed to disk to satisfy the buffer pool's internal
+  /// invariants after the allocation. 'lock_' should be held by the caller via
+  /// 'client_lock'
+  Status CleanPages(boost::unique_lock<boost::mutex>* client_lock, int64_t len);
+
+  /// Initiates asynchronous writes of dirty unpinned pages to disk. Ensures that at
+  /// least 'min_bytes_to_write' bytes of writes will be written asynchronously. May
+  /// start writes more aggressively so that I/O and compute can be overlapped. If
+  /// any errors are encountered, 'write_status_' is set. 'write_status_' must therefore
+  /// be checked before reading back any pages. 'lock_' must be held by the caller.
+  //void WriteDirtyPagesAsync(int64_t min_bytes_to_write = 0);
+
+  /// Called when a write for 'page' completes.
+  //void WriteCompleteCallback(Page* page, const Status& write_status);
+
+  /// Move an evicted page to the pinned state by allocating a new buffer, starting an
+  /// async read from disk and moving the page to 'pinned_pages_'. client->impl must be
+  /// locked by the caller via 'client_lock' and handle->page must be unlocked.
+  /// 'client_lock' is released then reacquired.
+  //Status StartMoveEvictedToPinned(
+  //    boost::unique_lock<boost::mutex>* client_lock, ClientHandle* client, Page* page);
+
+  /// The buffer pool that owns the client.
+  BufferPool* const pool_;
+
+  /// The file group that should be used for allocating scratch space. If NULL, spilling
+  /// is disabled.
+  //TmpFileMgr::FileGroup* const file_group_;
+
+  /// A name identifying the client.
+  const std::string name_;
+
+  /// The reservation tracker for the client. All pages pinned by the client count as
+  /// usage against 'reservation_'.
+  ReservationTracker reservation_;
+
+  /// The RuntimeProfile counters for this client, owned by the client's RuntimeProfile.
+  /// All non-NULL.
+  BufferPoolClientCounters counters_;
+
+  /// Debug option to delay write completion.
+  int debug_write_delay_ms_;
+
+  /// Lock to protect the below member variables;
+  boost::mutex lock_;
+
+  /// Condition variable signalled when a write for this client completes.
+  ConditionVariable write_complete_cv_;
+
+  /// All non-OK statuses returned by write operations are merged into this status.
+  /// All operations that depend on pages being written to disk successfully (e.g.
+  /// reading pages back from disk) must check 'write_status_' before proceeding, so
+  /// that write errors that occurred asynchronously are correctly propagated. The
+  /// write error is global to the client so can be propagated to any Status-returning
+  /// operation for the client (even for operations on different Pages or Buffers).
+  /// Write errors are not recoverable so it is best to propagate them as quickly
+  /// as possible, instead of waiting to propagate them in a specific way.
+  Status write_status_;
+
+  /// Total number of pages for this client. Used for debugging and enforcing that all
+  /// pages are destroyed before the client.
+  int64_t num_pages_;
+
+  /// Total bytes of buffers in BufferHandles returned to clients (i.e. obtained from
+  /// AllocateBuffer() or ExtractBuffer()).
+  int64_t buffers_allocated_bytes_;
+
+  /// All pinned pages for this client.
+  PageList pinned_pages_;
+
+  /// Dirty unpinned pages for this client for which writes are not in flight. Page
+  /// writes are started in LIFO order, because operators typically have sequential access
+  /// patterns where the most recently evicted page will be last to be read.
+  PageList dirty_unpinned_pages_;
+
+  /// Dirty unpinned pages for this client for which writes are in flight.
+  PageList in_flight_write_pages_;
+};
+}
+
+#endif
diff --git a/be/src/runtime/bufferpool/free_list.h b/be/src/runtime/bufferpool/free_list.h
new file mode 100644
index 0000000000..b6b3d7c484
--- /dev/null
+++ b/be/src/runtime/bufferpool/free_list.h
@@ -0,0 +1,123 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_RUNTIME_BUFFERPOOL_FREE_LIST_H
+#define BDG_PALO_BE_RUNTIME_BUFFERPOOL_FREE_LIST_H
+
+#include 
+#include 
+#include 
+
+#include 
+
+#include "common/logging.h"
+#include "gutil/macros.h"
+#include "runtime/bufferpool/buffer_pool.h"
+
+namespace palo {
+
+using BufferHandle = BufferPool::BufferHandle;
+
+/// A non-threadsafe list of free buffers.
+///
+/// Buffers are allocated by the caller and can be added to the list for later retrieval
+/// with AddFreeBuffer(). If the list is non-empty, calling PopFreeBuffer() will return
+/// one of the buffers previously added to the list. FreeList is agnostic about the size
+/// or other properties of the buffers added to it.
+///
+/// Buffers in the list can be freed at any point, e.g. if the list is storing too many
+/// free buffers (according to some policy). The caller is responsible for implementing
+/// the policy and calling FreeBuffers() or FreeAll() at the appropriate times.
+///
+/// Address space fragmentation
+/// ---------------------------
+/// To reduce memory fragmentation, the free list hands out buffers with lower memory
+/// addresses first and frees buffers with higher memory address first. If buffers were
+/// handed out by a policy that didn't take memory address into account, over time the
+/// distribution of free buffers within the address space would become essentially
+/// random. If free buffers were then unmapped, there would be many holes in the virtual
+/// memory map, which can cause difficulties for the OS in some cases, e.g. exceeding the
+/// maximum number of mmapped() regions (vm.max_map_count) in Linux. Using this approach
+/// will tend to consolidate free buffers in higher parts of the address space, allowing
+/// coalescing of the holes in most cases.
+class FreeList {
+ public:
+  FreeList() {}
+
+  /// Gets a free buffer. If the list is non-empty, returns true and sets 'buffer' to
+  /// one of the buffers previously added with AddFreeBuffer(). Otherwise returns false.
+  bool PopFreeBuffer(BufferHandle* buffer) {
+    if (free_list_.empty()) return false;
+    // Pop the min-heap root: the buffer with the lowest memory address. Handing out
+    // low addresses first reduces address-space fragmentation (see class comment).
+    std::pop_heap(free_list_.begin(), free_list_.end(), HeapCompare);
+    *buffer = std::move(free_list_.back());
+    free_list_.pop_back();
+    return true;
+  }
+
+  /// Adds a free buffer to the list.
+  void AddFreeBuffer(BufferHandle&& buffer) {
+    // Poison the buffer contents so use-after-free bugs are caught in debug builds.
+    buffer.Poison();
+    free_list_.emplace_back(std::move(buffer));
+    std::push_heap(free_list_.begin(), free_list_.end(), HeapCompare);
+  }
+
+  /// Get the 'num_buffers' buffers with the highest memory address from the list to
+  /// free. The average time complexity is n log n, where n is the current size of the
+  /// list.
+  std::vector<BufferHandle> GetBuffersToFree(int64_t num_buffers) {
+    std::vector<BufferHandle> buffers;
+    DCHECK_LE(num_buffers, free_list_.size());
+    // Sort the list so we can free the buffers with higher memory addresses.
+    // Note that the sorted list is still a valid min-heap.
+    std::sort(free_list_.begin(), free_list_.end(), SortCompare);
+
+    for (int64_t i = 0; i < num_buffers; ++i) {
+      buffers.emplace_back(std::move(free_list_.back()));
+      free_list_.pop_back();
+    }
+    return buffers;
+  }
+
+  /// Returns the number of buffers currently in the list.
+  int64_t Size() const { return free_list_.size(); }
+
+ private:
+  friend class FreeListTest;
+
+  DISALLOW_COPY_AND_ASSIGN(FreeList);
+
+  /// Compare function that orders by memory address.
+  inline static bool SortCompare(const BufferHandle& b1, const BufferHandle& b2) {
+    return b1.data() < b2.data();
+  }
+
+  /// Compare function that orders by memory address. Needs to be inverse of SortCompare()
+  /// because C++ provides a max-heap.
+  inline static bool HeapCompare(const BufferHandle& b1, const BufferHandle& b2) {
+    return SortCompare(b2, b1);
+  }
+
+  /// List of free memory buffers. Maintained as a min-heap ordered by the memory address
+  /// of the buffer.
+  std::vector<BufferHandle> free_list_;
+};
+}
+
+#endif
diff --git a/be/src/runtime/bufferpool/reservation_tracker.cc b/be/src/runtime/bufferpool/reservation_tracker.cc
new file mode 100644
index 0000000000..c191aa3552
--- /dev/null
+++ b/be/src/runtime/bufferpool/reservation_tracker.cc
@@ -0,0 +1,419 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "runtime/bufferpool/reservation_tracker.h"
+
+#include 
+#include 
+
+#include "common/object_pool.h"
+#include "gutil/strings/substitute.h"
+#include "runtime/mem_tracker.h"
+#include "util/dummy_runtime_profile.h"
+#include "util/runtime_profile.h"
+
+#include "common/names.h"
+#include "olap/utils.h"
+
+namespace palo {
+
+// Trackers start uninitialized; InitRootTracker() or InitChildTracker() must be
+// called before any other method.
+ReservationTracker::ReservationTracker() {}
+
+ReservationTracker::~ReservationTracker() {
+  // Close() must have run (or the tracker was never initialized) before destruction;
+  // otherwise the reservation would never be returned to the parent.
+  DCHECK(!initialized_);
+}
+
+// Initializes this tracker as the root of a hierarchy: no parent, no linked
+// MemTracker, zero initial reservation, capped at 'reservation_limit' bytes.
+void ReservationTracker::InitRootTracker(
+    RuntimeProfile* profile, int64_t reservation_limit) {
+  lock_guard<SpinLock> l(lock_);
+  DCHECK(!initialized_);
+  parent_ = nullptr;
+  mem_tracker_ = nullptr;
+  reservation_limit_ = reservation_limit;
+  reservation_ = 0;
+  used_reservation_ = 0;
+  child_reservations_ = 0;
+  initialized_ = true;
+
+  InitCounters(profile, reservation_limit_);
+  // Seed the high-water-mark counter with the (zero) starting reservation.
+  COUNTER_SET(counters_.peak_reservation, reservation_);
+
+  CheckConsistency();
+}
+
+// Initializes this tracker as a child of 'parent', optionally linked to
+// 'mem_tracker' for consumption reporting, capped at 'reservation_limit' bytes.
+// Validates (debug-only) that the MemTracker chain mirrors the tracker chain.
+void ReservationTracker::InitChildTracker(RuntimeProfile* profile,
+    ReservationTracker* parent, MemTracker* mem_tracker, int64_t reservation_limit) {
+  DCHECK(parent != nullptr);
+  DCHECK_GE(reservation_limit, 0);
+
+  lock_guard<SpinLock> l(lock_);
+  DCHECK(!initialized_);
+  parent_ = parent;
+  mem_tracker_ = mem_tracker;
+
+  reservation_limit_ = reservation_limit;
+  reservation_ = 0;
+  used_reservation_ = 0;
+  child_reservations_ = 0;
+  initialized_ = true;
+
+  if (mem_tracker_ != nullptr) {
+    MemTracker* parent_mem_tracker = GetParentMemTracker();
+    if (parent_mem_tracker != nullptr) {
+      // Make sure the parent links of the MemTrackers correspond to our parent links.
+      DCHECK_EQ(parent_mem_tracker, mem_tracker_->parent());
+      // Make sure we don't have a lower limit than the ancestor, since we don't enforce
+      // limits at lower links.
+      DCHECK_EQ(mem_tracker_->lowest_limit(), parent_mem_tracker->lowest_limit());
+    } else {
+      // Make sure we didn't leave a gap in the links. E.g. this tracker's grandparent
+      // shouldn't have a MemTracker.
+      ReservationTracker* ancestor = parent_;
+      while (ancestor != nullptr) {
+        DCHECK(ancestor->mem_tracker_ == nullptr);
+        ancestor = ancestor->parent_;
+      }
+    }
+  }
+
+  InitCounters(profile, reservation_limit_);
+
+  CheckConsistency();
+}
+
+// Registers the tracker's counters in 'profile', or in a freshly created dummy
+// profile when 'profile' is null (so counter updates are always safe).
+void ReservationTracker::InitCounters(
+    RuntimeProfile* profile, int64_t reservation_limit) {
+  if (profile == nullptr) {
+    dummy_profile_.reset(new DummyProfile);
+    profile = dummy_profile_->profile();
+  }
+
+  // Check that another tracker's counters aren't already registered in the profile.
+  DCHECK(profile->get_counter("PeakReservation") == nullptr);
+  counters_.peak_reservation =
+      profile->AddHighWaterMarkCounter("PeakReservation", TUnit::BYTES);
+  counters_.peak_used_reservation =
+      profile->AddHighWaterMarkCounter("PeakUsedReservation", TUnit::BYTES);
+  // Only show the limit if set.
+  counters_.reservation_limit = nullptr;
+  if (reservation_limit != numeric_limits<int64_t>::max()) {
+    counters_.reservation_limit = ADD_COUNTER(profile, "ReservationLimit", TUnit::BYTES);
+    COUNTER_SET(counters_.reservation_limit, reservation_limit);
+  }
+  if (mem_tracker_ != nullptr) mem_tracker_->enable_reservation_reporting(counters_);
+}
+
+// Tears the tracker down: returns the whole reservation to the parent chain and
+// resets to the uninitialized state. Requires no used or child reservations.
+// Idempotent: a second call is a no-op.
+void ReservationTracker::Close() {
+  lock_guard<SpinLock> l(lock_);
+  if (!initialized_) return;
+  CheckConsistency();
+  DCHECK_EQ(used_reservation_, 0);
+  DCHECK_EQ(child_reservations_, 0);
+  // Release any reservation to parent.
+  if (parent_ != nullptr) DecreaseReservationLocked(reservation_, false);
+  mem_tracker_ = nullptr;
+  parent_ = nullptr;
+  initialized_ = false;
+}
+
+// Grows the reservation by exactly 'bytes' (all-or-nothing). Thin locking wrapper
+// around IncreaseReservationInternalLocked().
+bool ReservationTracker::IncreaseReservation(int64_t bytes) {
+  lock_guard<SpinLock> l(lock_);
+  return IncreaseReservationInternalLocked(bytes, false, false);
+}
+
+// Grows the reservation only as much as needed so that 'bytes' of unused
+// reservation is available; a no-op if it already fits.
+bool ReservationTracker::IncreaseReservationToFit(int64_t bytes) {
+  lock_guard<SpinLock> l(lock_);
+  return IncreaseReservationInternalLocked(bytes, true, false);
+}
+
+// Core of the reservation-increase protocol: checks this tracker's limit, then
+// recursively reserves from the parent (acquiring parent locks upward), then
+// reflects the increase in the linked MemTracker. Only mutates local state after
+// every ancestor has granted, so a denial leaves all trackers unchanged.
+bool ReservationTracker::IncreaseReservationInternalLocked(
+    int64_t bytes, bool use_existing_reservation, bool is_child_reservation) {
+  DCHECK(initialized_);
+  int64_t reservation_increase =
+      use_existing_reservation ? max<int64_t>(0, bytes - unused_reservation()) : bytes;
+  DCHECK_GE(reservation_increase, 0);
+
+  bool granted;
+  // Check if the increase is allowed, starting at the bottom of hierarchy.
+  if (reservation_increase == 0) {
+    granted = true;
+  } else if (increase_deny_probability_ != 0.0
+      && rand() < increase_deny_probability_ * (RAND_MAX + 1L)) {
+    // Randomly deny reservation if requested. Use rand() to avoid needing to set up a RNG.
+    // Should be good enough. If the probability is 0.0, this never triggers. If it is 1.0
+    // it always triggers.
+    granted = false;
+  } else if (reservation_ + reservation_increase > reservation_limit_) {
+    granted = false;
+  } else {
+    if (parent_ == nullptr) {
+      granted = true;
+    } else {
+      lock_guard<SpinLock> l(parent_->lock_);
+      granted =
+          parent_->IncreaseReservationInternalLocked(reservation_increase, true, true);
+    }
+    if (granted && !TryConsumeFromMemTracker(reservation_increase)) {
+      granted = false;
+      // Roll back changes to ancestors if MemTracker update fails.
+      // NOTE(review): dereferences parent_ unconditionally. This appears safe because
+      // the root tracker has no linked MemTracker, so TryConsumeFromMemTracker() cannot
+      // fail when parent_ is null -- confirm if that invariant ever changes.
+      parent_->DecreaseReservation(reservation_increase, true);
+    }
+  }
+
+  if (granted) {
+    // The reservation was granted and state updated in all ancestors: we can modify
+    // this tracker's state now.
+    UpdateReservation(reservation_increase);
+    if (is_child_reservation) child_reservations_ += bytes;
+  }
+
+  CheckConsistency();
+  return granted;
+}
+
+// Reflects 'reservation_increase' in the linked MemTracker, if any. Returns false
+// only when the topmost linked MemTracker rejects the consumption due to its limit;
+// lower links and unlinked trackers always succeed.
+bool ReservationTracker::TryConsumeFromMemTracker(int64_t reservation_increase) {
+  DCHECK_GE(reservation_increase, 0);
+  if (mem_tracker_ == nullptr) return true;
+  if (GetParentMemTracker() == nullptr) {
+    // At the topmost link, which may be a MemTracker with a limit, we need to use
+    // TryConsume() to check the limit.
+    return mem_tracker_->try_consume(reservation_increase);
+  } else {
+    // For lower links, there shouldn't be a limit to enforce, so we just need to
+    // update the consumption of the linked MemTracker since the reservation is
+    // already reflected in its parent.
+    mem_tracker_->consume_local(reservation_increase, GetParentMemTracker());
+    return true;
+  }
+}
+
+// Inverse of TryConsumeFromMemTracker(): removes 'reservation_decrease' of
+// consumption from the linked MemTracker. No-op when there is no linked tracker.
+void ReservationTracker::ReleaseToMemTracker(int64_t reservation_decrease) {
+  DCHECK_GE(reservation_decrease, 0);
+  if (mem_tracker_ == nullptr) return;
+  if (GetParentMemTracker() == nullptr) {
+    // Topmost link: release directly against this MemTracker.
+    mem_tracker_->release(reservation_decrease);
+  } else {
+    // Lower link: adjust only the consumption local to this link; the decrease is
+    // accounted separately at the parent link.
+    mem_tracker_->release_local(reservation_decrease, GetParentMemTracker());
+  }
+}
+
+// Public locking wrapper: shrinks this tracker's reservation by 'bytes' and
+// returns it up the ancestor chain.
+void ReservationTracker::DecreaseReservation(int64_t bytes, bool is_child_reservation) {
+  lock_guard<SpinLock> l(lock_);
+  DecreaseReservationLocked(bytes, is_child_reservation);
+}
+
+// Shrinks this tracker's state first (reservation, optional child accounting,
+// linked MemTracker), then recurses up so each ancestor reclaims the bytes.
+// Caller must hold 'lock_'.
+void ReservationTracker::DecreaseReservationLocked(
+    int64_t bytes, bool is_child_reservation) {
+  DCHECK(initialized_);
+  DCHECK_GE(reservation_, bytes);
+  if (bytes == 0) return;
+  if (is_child_reservation) child_reservations_ -= bytes;
+  UpdateReservation(-bytes);
+  ReleaseToMemTracker(bytes);
+  // The reservation should be returned up the tree.
+  if (parent_ != nullptr) parent_->DecreaseReservation(bytes, true);
+  CheckConsistency();
+}
+
+// Atomically moves 'bytes' of reservation from this tracker to 'other' by updating
+// every tracker on the two paths below their lowest common ancestor. All affected
+// locks are taken (in a deterministic subtree order) before any state changes.
+bool ReservationTracker::TransferReservationTo(ReservationTracker* other, int64_t bytes) {
+  if (other == this) return true;
+  // Find the path to the root from both. The root is guaranteed to be a common ancestor.
+  vector<ReservationTracker*> path_to_common = FindPathToRoot();
+  vector<ReservationTracker*> other_path_to_common = other->FindPathToRoot();
+  DCHECK_EQ(path_to_common.back(), other_path_to_common.back());
+  ReservationTracker* common_ancestor = path_to_common.back();
+  // Remove any common ancestors - they do not need to be updated for this transfer.
+  while (!path_to_common.empty() && !other_path_to_common.empty()
+      && path_to_common.back() == other_path_to_common.back()) {
+    common_ancestor = path_to_common.back();
+    path_to_common.pop_back();
+    other_path_to_common.pop_back();
+  }
+
+  // At this point, we have three cases:
+  // 1. 'common_ancestor' == 'other'. 'other_path_to_common' is empty because 'other' is
+  //    the lowest common ancestor. To transfer, we decrease the reservation on the
+  //    trackers under 'other', down to 'this'.
+  // 2. 'common_ancestor' == 'this'. 'path_to_common' is empty because 'this' is the
+  //    lowest common ancestor. To transfer, we increase the reservation on the trackers
+  //    under 'this', down to 'other'.
+  // 3. Neither is an ancestor of the other. Both 'other_path_to_common' and
+  //    'path_to_common' are non-empty. We increase the reservation on trackers from
+  //    'other' up to one below the common ancestor (checking limits as needed) and if
+  //    successful, decrease reservations on trackers from 'this' up to one below the
+  //    common ancestor.
+
+  // Lock all of the trackers so we can do the update atomically. Need to be careful to
+  // lock subtrees in the correct order.
+  vector<unique_lock<SpinLock>> locks;
+  bool lock_first = path_to_common.empty() || other_path_to_common.empty()
+      || lock_sibling_subtree_first(path_to_common.back(), other_path_to_common.back());
+  if (lock_first) {
+    for (ReservationTracker* tracker : path_to_common) locks.emplace_back(tracker->lock_);
+  }
+  for (ReservationTracker* tracker : other_path_to_common) {
+    locks.emplace_back(tracker->lock_);
+  }
+  if (!lock_first) {
+    for (ReservationTracker* tracker : path_to_common) locks.emplace_back(tracker->lock_);
+  }
+
+  // Check reservation limits will not be violated before applying any updates.
+  for (ReservationTracker* tracker : other_path_to_common) {
+    if (tracker->reservation_ + bytes > tracker->reservation_limit_) return false;
+  }
+
+  // Do the updates now that we have checked the limits. We're holding all the locks
+  // so this is all atomic.
+  for (ReservationTracker* tracker : other_path_to_common) {
+    tracker->UpdateReservation(bytes);
+    // We don't handle MemTrackers with limit in this function - this should always
+    // succeed.
+    DCHECK(tracker->mem_tracker_ == nullptr || !tracker->mem_tracker_->has_limit());
+    bool success = tracker->TryConsumeFromMemTracker(bytes);
+    DCHECK(success);
+    if (tracker != other_path_to_common[0]) tracker->child_reservations_ += bytes;
+  }
+
+  for (ReservationTracker* tracker : path_to_common) {
+    if (tracker != path_to_common[0]) tracker->child_reservations_ -= bytes;
+    tracker->UpdateReservation(-bytes);
+    tracker->ReleaseToMemTracker(bytes);
+  }
+
+  // Update the 'child_reservations_' on the common ancestor if needed.
+  // Case 1: reservation was pushed up to 'other'.
+  if (common_ancestor == other) {
+    lock_guard<SpinLock> l(other->lock_);
+    other->child_reservations_ -= bytes;
+    other->CheckConsistency();
+  }
+  // Case 2: reservation was pushed down below 'this'.
+  if (common_ancestor == this) {
+    lock_guard<SpinLock> l(lock_);
+    child_reservations_ += bytes;
+    CheckConsistency();
+  }
+  return true;
+}
+
+// Returns the chain of trackers from this tracker (inclusive) up to the root
+// (inclusive), ordered bottom-up.
+vector<ReservationTracker*> ReservationTracker::FindPathToRoot() {
+  vector<ReservationTracker*> path_to_root;
+  ReservationTracker* curr = this;
+  do {
+    path_to_root.push_back(curr);
+    curr = curr->parent_;
+  } while (curr != nullptr);
+  return path_to_root;
+}
+
+// Marks 'bytes' of the (already granted) reservation as in use. Caller must
+// guarantee at least 'bytes' of unused reservation.
+void ReservationTracker::AllocateFrom(int64_t bytes) {
+  lock_guard<SpinLock> l(lock_);
+  DCHECK(initialized_);
+  DCHECK_GE(bytes, 0);
+  DCHECK_LE(bytes, unused_reservation());
+  UpdateUsedReservation(bytes);
+  CheckConsistency();
+}
+
+// Returns 'bytes' of used reservation back to the unused pool. Caller must
+// guarantee at least 'bytes' are currently in use.
+void ReservationTracker::ReleaseTo(int64_t bytes) {
+  lock_guard<SpinLock> l(lock_);
+  DCHECK(initialized_);
+  DCHECK_GE(bytes, 0);
+  DCHECK_LE(bytes, used_reservation_);
+  UpdateUsedReservation(-bytes);
+  CheckConsistency();
+}
+
+// Thread-safe accessors: each takes 'lock_' to return a consistent snapshot.
+int64_t ReservationTracker::GetReservation() {
+  lock_guard<SpinLock> l(lock_);
+  DCHECK(initialized_);
+  return reservation_;
+}
+
+int64_t ReservationTracker::GetUsedReservation() {
+  lock_guard<SpinLock> l(lock_);
+  DCHECK(initialized_);
+  return used_reservation_;
+}
+
+int64_t ReservationTracker::GetUnusedReservation() {
+  lock_guard<SpinLock> l(lock_);
+  DCHECK(initialized_);
+  return unused_reservation();
+}
+
+int64_t ReservationTracker::GetChildReservations() {
+  lock_guard<SpinLock> l(lock_);
+  DCHECK(initialized_);
+  return child_reservations_;
+}
+
+// Debug-only validation of the class invariants; compiles to DCHECKs, so it is
+// free in release builds. Caller must hold 'lock_'.
+void ReservationTracker::CheckConsistency() const {
+  // Check internal invariants.
+  DCHECK_GE(reservation_, 0);
+  DCHECK_LE(reservation_, reservation_limit_);
+  DCHECK_GE(child_reservations_, 0);
+  DCHECK_GE(used_reservation_, 0);
+  // Used plus child reservations can never exceed the total reservation.
+  DCHECK_LE(used_reservation_ + child_reservations_, reservation_);
+
+  // The profile counters must mirror internal state: 'current_value' is the live
+  // value and 'value' is the high-water mark, which can only be >= the live value.
+  DCHECK_EQ(reservation_, counters_.peak_reservation->current_value());
+  DCHECK_LE(reservation_, counters_.peak_reservation->value());
+  DCHECK_EQ(used_reservation_, counters_.peak_used_reservation->current_value());
+  DCHECK_LE(used_reservation_, counters_.peak_used_reservation->value());
+  if (counters_.reservation_limit != nullptr) {
+    DCHECK_EQ(reservation_limit_, counters_.reservation_limit->value());
+  }
+}
+
+// Applies 'delta' to 'used_reservation_' and mirrors the new value into the
+// high-water-mark counter. Caller must hold 'lock_'.
+void ReservationTracker::UpdateUsedReservation(int64_t delta) {
+  used_reservation_ += delta;
+  COUNTER_SET(counters_.peak_used_reservation, used_reservation_);
+  // Log the matching counter/value pair (the original logged 'reservation_' under
+  // the "used reservation" label, which made the trace misleading).
+  VLOG_QUERY << "peak:" << counters_.peak_used_reservation->current_value()
+             << " used reservation:" << used_reservation_;
+  CheckConsistency();
+}
+
+// Applies 'delta' to 'reservation_' and mirrors the new value into the
+// high-water-mark counter. Caller must hold 'lock_'.
+void ReservationTracker::UpdateReservation(int64_t delta) {
+  reservation_ += delta;
+  // COUNTER_SET() updates both the current value and the high-water mark; the
+  // previous extra peak_reservation->set() call was a redundant duplicate.
+  COUNTER_SET(counters_.peak_reservation, reservation_);
+  CheckConsistency();
+}
+
+// Renders this tracker's state followed by the full ancestor chain.
+// NOTE(review): intentionally does not acquire 'lock_' -- callers such as
+// TransferReservationTo() may already hold tracker locks, so taking it here could
+// self-deadlock. The reads below are therefore unsynchronized best-effort
+// diagnostics; confirm all call sites before adding locking.
+string ReservationTracker::DebugString() {
+  if (!initialized_) return ": uninitialized";
+
+  string parent_debug_string = parent_ == nullptr ? "NULL" : parent_->DebugString();
+  std::stringstream ss;
+  ss << ": reservation_limit " << reservation_limit_
+     << " reservation " << reservation_ << " used_reservation " << used_reservation_
+     << " child_reservations " << child_reservations_ << " parent:\n"
+     << parent_debug_string;
+  return ss.str();
+}
+}
diff --git a/be/src/runtime/bufferpool/reservation_tracker.h b/be/src/runtime/bufferpool/reservation_tracker.h
new file mode 100644
index 0000000000..b029382a09
--- /dev/null
+++ b/be/src/runtime/bufferpool/reservation_tracker.h
@@ -0,0 +1,297 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_RUNTIME_RESERVATION_TRACKER_H
+#define BDG_PALO_BE_RUNTIME_RESERVATION_TRACKER_H
+
+#include 
+#include 
+#include 
+#include 
+
+#include "runtime/bufferpool/reservation_tracker_counters.h"
+#include "common/status.h"
+#include "util/spinlock.h"
+
+namespace palo {
+
+class DummyProfile;
+class MemTracker;
+class RuntimeProfile;
+
+/// A tracker for a hierarchy of buffer pool memory reservations, denominated in bytes.
+/// A hierarchy of ReservationTrackers provides a mechanism for subdividing buffer pool
+/// memory and enforcing upper and lower bounds on memory usage.
+///
+/// The root of the tracker tree enforces a global maximum, which is distributed among its
+/// children. Each tracker in the tree has a 'reservation': the total bytes of buffer pool
+/// memory it is entitled to use. The reservation is inclusive of any memory that is
+/// already allocated from the reservation, i.e. using a reservation to allocate memory
+/// does not subtract from the reservation.
+///
+/// A reservation can be used directly at the tracker by calling AllocateFrom(), or
+/// distributed to children of the tracker for the childrens' reservations. Each tracker
+/// in the tree can use up to its reservation without checking parent trackers. To
+/// increase its reservation, a tracker must use some of its parent's reservation (and
+/// perhaps increase reservations all the way to the root of the tree).
+///
+/// Each tracker also has a maximum reservation that is enforced. E.g. if the root of the
+/// tracker hierarchy is the global tracker for the Impala daemon and the next level of
+/// the hierarchy is made up of per-query trackers, then the maximum reservation
+/// mechanism can enforce both process-level and query-level limits on reservations.
+///
+/// Invariants:
+/// * A tracker's reservation is at most its reservation limit: reservation <= limit
+/// * A tracker's reservation is at least the sum of its childrens' reservations plus
+///   the amount of the reservation used directly at this tracker. The difference is
+///   the unused reservation:
+///     child_reservations + used_reservation + unused_reservation = reservation.
+///
+/// Thread-safety:
+/// All public ReservationTracker methods are thread-safe. If multiple threads
+/// concurrently invoke methods on a ReservationTracker, each operation is applied
+/// atomically to leave the ReservationTracker in a consistent state. Calling threads
+/// are responsible for coordinating to avoid violating any method preconditions,
+/// e.g. ensuring that there is sufficient unused reservation before calling AllocateTo().
+///
+/// Integration with MemTracker hierarchy:
+/// TODO: we will remove MemTracker and this integration once all memory is accounted via
+/// reservations.
+///
+/// Each ReservationTracker can optionally have a linked MemTracker. E.g. an exec
+/// node's ReservationTracker can be linked with the exec node's MemTracker, so that
+/// reservations are included in query memory consumption for the purposes of enforcing
+/// memory limits, reporting and logging. The reservation is accounted as consumption
+/// against the linked MemTracker and its ancestors because reserved memory is committed.
+/// Allocating from a reservation therefore does not change the consumption reflected in
+/// the MemTracker hierarchy.
+///
+/// MemTracker limits are only checked via the topmost link (i.e. the query-level
+/// trackers): we require that no MemTrackers below this level have limits.
+///
+/// We require that the MemTracker hierarchy is consistent with the ReservationTracker
+/// hierarchy. I.e. if a ReservationTracker is linked to a MemTracker "A", and its parent
+/// is linked to a MemTracker "B", then "B" must be the parent of "A"'.
+class ReservationTracker {
+ public:
+  ReservationTracker();
+  virtual ~ReservationTracker();
+
+  /// Initializes the root tracker with the given reservation limit in bytes. The initial
+  /// reservation is 0.
+  /// if 'profile' is not NULL, the counters defined in ReservationTrackerCounters are
+  /// added to 'profile'.
+  void InitRootTracker(RuntimeProfile* profile, int64_t reservation_limit);
+
+  /// Initializes a new ReservationTracker with a parent.
+  /// If 'mem_tracker' is not NULL, reservations for this ReservationTracker and its
+  /// children will be counted as consumption against 'mem_tracker'.
+  /// 'reservation_limit' is the maximum reservation for this tracker in bytes.
+  /// if 'profile' is not NULL, the counters in 'counters_' are added to 'profile'.
+  void InitChildTracker(RuntimeProfile* profile, ReservationTracker* parent,
+      MemTracker* mem_tracker, int64_t reservation_limit);
+
+  /// If the tracker is initialized, deregister the ReservationTracker from its parent,
+  /// relinquishing all this tracker's reservation. All of the reservation must be unused
+  /// and all the tracker's children must be closed before calling this method.
+  /// TODO: decide on and implement policy for how far to release the reservation up
+  /// the tree. Currently the reservation is released all the way to the root.
+  void Close();
+
+  /// Request to increase reservation by 'bytes'. The request is either granted in
+  /// full or not at all. Uses any unused reservation on ancestors and increase
+  /// ancestors' reservations if needed to fit the increased reservation.
+  /// Returns true if the reservation increase is granted, or false if not granted.
+  /// If the reservation is not granted, no modifications are made to the state of
+  /// any ReservationTrackers.
+  bool IncreaseReservation(int64_t bytes) WARN_UNUSED_RESULT;
+
+  /// Tries to ensure that 'bytes' of unused reservation is available. If not already
+  /// available, tries to increase the reservation such that the unused reservation is
+  /// exactly equal to 'bytes'. Uses any unused reservation on ancestors and increase
+  /// ancestors' reservations if needed to fit the increased reservation.
+  /// Returns true if the reservation increase was successful or not necessary.
+  bool IncreaseReservationToFit(int64_t bytes) WARN_UNUSED_RESULT;
+
+  /// Decrease reservation by 'bytes' on this tracker and all ancestors. This tracker's
+  /// reservation must be at least 'bytes' before calling this method.
+  void DecreaseReservation(int64_t bytes) { DecreaseReservation(bytes, false); }
+
+  /// Transfer reservation from this tracker to 'other'. Both trackers must be in the
+  /// same query subtree of the hierarchy. One tracker can be the ancestor of the other,
+  /// or they can share a common ancestor. The subtree root must be at the query level
+  /// or below so that the transfer cannot cause a MemTracker limit to be exceeded
+  /// (because linked MemTrackers with limits below the query level are not supported).
+  /// Returns true on success or false if the transfer would have caused a reservation
+  /// limit to be exceeded.
+  bool TransferReservationTo(ReservationTracker* other, int64_t bytes) WARN_UNUSED_RESULT;
+
+  /// Allocate 'bytes' from the reservation. The tracker must have at least 'bytes'
+  /// unused reservation before calling this method.
+  void AllocateFrom(int64_t bytes);
+
+  /// Release 'bytes' of previously allocated memory. The used reservation is
+  /// decreased by 'bytes'. Before the call, the used reservation must be at least
+  /// 'bytes' before calling this method.
+  void ReleaseTo(int64_t bytes);
+
+  /// Returns the amount of the reservation in bytes.
+  int64_t GetReservation();
+
+  /// Returns the current amount of the reservation used at this tracker, not including
+  /// reservations of children in bytes.
+  int64_t GetUsedReservation();
+
+  /// Returns the amount of the reservation neither used nor given to childrens'
+  /// reservations at this tracker in bytes.
+  int64_t GetUnusedReservation();
+
+  /// Returns the total reservations of children in bytes.
+  int64_t GetChildReservations();
+
+  /// Support for debug actions: deny reservation increase with probability 'probability'.
+  void SetDebugDenyIncreaseReservation(double probability) {
+    increase_deny_probability_ = probability;
+  }
+
+  ReservationTracker* parent() const { return parent_; }
+
+  std::string DebugString();
+
+ private:
+  /// Returns the amount of 'reservation_' that is unused.
+  inline int64_t unused_reservation() const {
+    return reservation_ - used_reservation_ - child_reservations_;
+  }
+
+  /// Returns the parent's memtracker if 'parent_' is non-NULL, or NULL otherwise.
+  MemTracker* GetParentMemTracker() const {
+    return parent_ == nullptr ? nullptr : parent_->mem_tracker_;
+  }
+
+  /// Initializes 'counters_', storing the counters in 'profile'.
+  /// If 'profile' is NULL, creates a dummy profile to store the counters.
+  void InitCounters(RuntimeProfile* profile, int64_t max_reservation);
+
+  /// Internal helper for IncreaseReservation(). If 'use_existing_reservation' is true,
+  /// increase by the minimum amount so that 'bytes' fits in the reservation, otherwise
+  /// just increase by 'bytes'. If 'is_child_reservation' is true, also increase
+  /// 'child_reservations_' by 'bytes'.
+  /// 'lock_' must be held by caller.
+  bool IncreaseReservationInternalLocked(
+      int64_t bytes, bool use_existing_reservation, bool is_child_reservation);
+
+  /// Increase consumption on linked MemTracker to reflect an increase in reservation
+  /// of 'reservation_increase'. For the topmost link, return false if this failed
+  /// because it would exceed a memory limit. If there is no linked MemTracker, just
+  /// returns true.
+  /// TODO: remove once we account all memory via ReservationTrackers.
+  bool TryConsumeFromMemTracker(int64_t reservation_increase);
+
+  /// Decrease consumption on linked MemTracker to reflect a decrease in reservation of
+  /// 'reservation_decrease'. If there is no linked MemTracker, does nothing.
+  /// TODO: remove once we account all memory via ReservationTrackers.
+  void ReleaseToMemTracker(int64_t reservation_decrease);
+
+  /// Decrease reservation by 'bytes' on this tracker and all ancestors. This tracker's
+  /// reservation must be at least 'bytes' before calling this method. If
+  /// 'is_child_reservation' is true it decreases 'child_reservations_' by 'bytes'
+  void DecreaseReservation(int64_t bytes, bool is_child_reservation);
+
+  /// Same as DecreaseReservation(), but 'lock_' must be held by caller.
+  void DecreaseReservationLocked(int64_t bytes, bool is_child_reservation);
+
+  /// Return a vector containing the trackers on the path to the root tracker. Includes
+  /// the current tracker and the root tracker.
+  std::vector<ReservationTracker*> FindPathToRoot();
+
+  /// Return true if trackers in the subtree rooted at 'subtree1' precede trackers in
+  /// the subtree rooted at 'subtree2' in the lock order. 'subtree1' and 'subtree2'
+  /// must share the same parent.
+  static bool lock_sibling_subtree_first(
+      ReservationTracker* subtree1, ReservationTracker* subtree2) {
+    DCHECK_EQ(subtree1->parent_, subtree2->parent_);
+    // Order siblings by object address; cast to uintptr_t for a well-defined
+    // integer comparison of the pointers.
+    return reinterpret_cast<uintptr_t>(subtree1) < reinterpret_cast<uintptr_t>(subtree2);
+  }
+
+  /// Check the internal consistency of the ReservationTracker and DCHECKs if in an
+  /// inconsistent state.
+  /// 'lock_' must be held by caller.
+  void CheckConsistency() const;
+
+  /// Increase or decrease 'used_reservation_' and update profile counters accordingly.
+  /// 'lock_' must be held by caller.
+  void UpdateUsedReservation(int64_t delta);
+
+  /// Increase or decrease 'reservation_' and update profile counters accordingly.
+  /// 'lock_' must be held by caller.
+  void UpdateReservation(int64_t delta);
+
+  /// Support for debug actions: see SetDebugDenyIncreaseReservation() for behaviour.
+  double increase_deny_probability_ = 0.0;
+
+  /// lock_ protects all below members. The lock order in a tree of ReservationTrackers is
+  /// based on a post-order traversal of the tree, with children visited in order of the
+  /// memory address of the ReservationTracker object. The following rules can be applied
+  /// to determine the relative positions of two trackers t1 and t2 in the lock order:
+  /// * If t1 is a descendent of t2, t1's lock must be acquired before t2's lock (i.e.
+  ///   locks are acquired bottom-up).
+  /// * If neither t1 or t2 is a descendant of the other, they must be in subtrees of
+  ///   under a common ancestor. If the memory address of t1's subtree's root is less
+  ///   than the memory address of t2's subtree's root, t1's lock must be acquired before
+  ///   t2's lock. This check is implemented in lock_sibling_subtree_first().
+  SpinLock lock_;
+
+  /// True if the tracker is initialized.
+  bool initialized_ = false;
+
+  /// A dummy profile to hold the counters in 'counters_' in the case that no profile
+  /// is provided.
+  boost::scoped_ptr<RuntimeProfile> dummy_profile_;
+
+  /// The RuntimeProfile counters for this tracker.
+  /// All non-NULL if 'initialized_' is true.
+  ReservationTrackerCounters counters_;
+
+  /// The parent of this tracker in the hierarchy. Does not change after initialization.
+  ReservationTracker* parent_ = nullptr;
+
+  /// If non-NULL, reservations are counted as memory consumption against this tracker.
+  /// Does not change after initialization. Not owned.
+  /// TODO: remove once all memory is accounted via ReservationTrackers.
+  MemTracker* mem_tracker_ = nullptr;
+
+  /// The maximum reservation in bytes that this tracker can have.
+  int64_t reservation_limit_;
+
+  /// This tracker's current reservation in bytes. 'reservation_' <= 'reservation_limit_'.
+  int64_t reservation_;
+
+  /// Total reservation of children in bytes. This is included in 'reservation_'.
+  /// 'used_reservation_' + 'child_reservations_' <= 'reservation_'.
+  int64_t child_reservations_;
+
+  /// The amount of the reservation currently used by this tracker in bytes.
+  /// 'used_reservation_' + 'child_reservations_' <= 'reservation_'.
+  int64_t used_reservation_;
+};
+}
+
+#endif
diff --git a/be/src/runtime/bufferpool/reservation_tracker_counters.h b/be/src/runtime/bufferpool/reservation_tracker_counters.h
index 3db0f0e59a..0f6400d740 100644
--- a/be/src/runtime/bufferpool/reservation_tracker_counters.h
+++ b/be/src/runtime/bufferpool/reservation_tracker_counters.h
@@ -18,8 +18,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#ifndef IMPALA_RUNTIME_RESERVATION_TRACKER_COUNTERS_H
-#define IMPALA_RUNTIME_RESERVATION_TRACKER_COUNTERS_H
+#ifndef BDG_PALO_BE_RUNTIME_RESERVATION_TRACKER_COUNTERS_H
+#define BDG_PALO_BE_RUNTIME_RESERVATION_TRACKER_COUNTERS_H
 
 #include "util/runtime_profile.h"
 
diff --git a/be/src/runtime/bufferpool/reservation_util.cc b/be/src/runtime/bufferpool/reservation_util.cc
new file mode 100644
index 0000000000..c763b5abad
--- /dev/null
+++ b/be/src/runtime/bufferpool/reservation_util.cc
@@ -0,0 +1,44 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "runtime/bufferpool/reservation_util.h"
+
+#include <algorithm>
+
+namespace palo {
+
+// Most operators that accumulate memory use reservations, so the majority of memory
+// should be allocated to buffer reservations, as a heuristic.
+const double ReservationUtil::RESERVATION_MEM_FRACTION = 0.8;
+const int64_t ReservationUtil::RESERVATION_MEM_MIN_REMAINING = 75 * 1024 * 1024;
+
+int64_t ReservationUtil::GetReservationLimitFromMemLimit(int64_t mem_limit) {
+  // Cap reservations at RESERVATION_MEM_FRACTION of the limit while always leaving
+  // RESERVATION_MEM_MIN_REMAINING bytes for unreserved memory. The explicit <int64_t>
+  // is required: the arguments mix double and int64_t, so unqualified std::min/std::max
+  // would fail template argument deduction.
+  int64_t max_reservation = std::min<int64_t>(
+      RESERVATION_MEM_FRACTION * mem_limit, mem_limit - RESERVATION_MEM_MIN_REMAINING);
+  return std::max<int64_t>(0, max_reservation);
+}
+
+int64_t ReservationUtil::GetMinMemLimitFromReservation(int64_t buffer_reservation) {
+  buffer_reservation = std::max<int64_t>(0, buffer_reservation);
+  // The returned limit X satisfies: buffer_reservation <=
+  // GetReservationLimitFromMemLimit(X). Explicit <int64_t> because the first
+  // candidate expression is a double.
+  return std::max<int64_t>(
+      buffer_reservation * (1.0 / ReservationUtil::RESERVATION_MEM_FRACTION),
+      buffer_reservation + ReservationUtil::RESERVATION_MEM_MIN_REMAINING);
+}
+}
diff --git a/be/src/runtime/bufferpool/reservation_util.h b/be/src/runtime/bufferpool/reservation_util.h
new file mode 100644
index 0000000000..e37b51983f
--- /dev/null
+++ b/be/src/runtime/bufferpool/reservation_util.h
@@ -0,0 +1,77 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_RUNTIME_BUFFERPOOL_RESERVATION_UTIL_H_
+#define BDG_PALO_BE_RUNTIME_BUFFERPOOL_RESERVATION_UTIL_H_
+
+#include 
+
+namespace palo {
+
+/// Utility code related to buffer reservations.
+class ReservationUtil {
+ public:
+  /// There are currently two classes of memory: reserved memory (i.e. memory that is
+  /// reserved with reservation trackers/allocated by the buffer pool), and unreserved
+  /// memory (i.e. everything else; code that hasn't yet been updated to use reserved
+  /// memory). Eventually, all memory should be in the former category, but each operator
+  /// must be converted to use reserved memory and that work is ongoing. See IMPALA-4834.
+  /// In the meantime, the system memory must be shared between these two classes of
+  /// memory. RESERVATION_MEM_FRACTION and RESERVATION_MEM_MIN_REMAINING are used to
+  /// determine an upper bound on reserved memory for a query. Operators operate reliably
+  /// when they are using bounded reserved memory (e.g. staying under a limit by
+  /// spilling), but will generally fail if they hit a limit when trying to allocate
+  /// unreserved memory. Thus we need to ensure there is always space left in the query
+  /// memory limit for unreserved memory.
+
+  /// The fraction of the query mem limit that is used as the maximum buffer reservation
+  /// limit, i.e. the bound on reserved memory. It is expected that unreserved memory
+  /// (i.e. not accounted by buffer reservation trackers) stays within
+  /// (1 - RESERVATION_MEM_FRACTION).
+  /// TODO: remove once all operators use buffer reservations.
+  static const double RESERVATION_MEM_FRACTION;
+
+  /// The minimum amount of memory that should be left after buffer reservations, i.e.
+  /// this is the minimum amount of memory that should be left for unreserved memory.
+  /// TODO: remove once all operators use buffer reservations.
+  static const int64_t RESERVATION_MEM_MIN_REMAINING;
+
+  /// Helper function to get the query buffer reservation limit (in bytes) given a query
+  /// mem_limit. In other words, this determines the maximum portion of the mem_limit
+  /// that should go to reserved memory. The limit on reservations is computed as:
+  /// min(query_limit * RESERVATION_MEM_FRACTION,
+  ///     query_limit - RESERVATION_MEM_MIN_REMAINING)
+  /// TODO: remove once all operators use buffer reservations.
+  static int64_t GetReservationLimitFromMemLimit(int64_t mem_limit);
+
+  /// Helper function to get the minimum query mem_limit (in bytes) that will be large
+  /// enough for a buffer reservation of size 'buffer_reservation' bytes. In other words,
+  /// this determines the minimum mem_limit that will be large enough to accomidate
+  /// 'buffer_reservation' reserved memory, as well as some amount of unreserved memory
+  /// (determined by a heuristic).
+  /// The returned mem_limit X satisfies:
+  ///    buffer_reservation <= GetReservationLimitFromMemLimit(X)
+  /// TODO: remove once all operators use buffer reservations.
+  static int64_t GetMinMemLimitFromReservation(int64_t buffer_reservation);
+};
+
+}
+
+#endif
diff --git a/be/src/runtime/bufferpool/suballocator.cc b/be/src/runtime/bufferpool/suballocator.cc
new file mode 100644
index 0000000000..49b1f8d0e8
--- /dev/null
+++ b/be/src/runtime/bufferpool/suballocator.cc
@@ -0,0 +1,249 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "runtime/bufferpool/suballocator.h"
+
+#include 
+
+#include "runtime/bufferpool/reservation_tracker.h"
+#include "util/bit_util.h"
+
+#include "common/names.h"
+#include "gutil/strings/substitute.h"
+
+namespace palo {
+
+constexpr int Suballocator::LOG_MAX_ALLOCATION_BYTES;
+constexpr int64_t Suballocator::MAX_ALLOCATION_BYTES;
+constexpr int Suballocator::LOG_MIN_ALLOCATION_BYTES;
+constexpr int64_t Suballocator::MIN_ALLOCATION_BYTES;
+//const int Suballocator::NUM_FREE_LISTS;
+
+Suballocator::Suballocator(
+    BufferPool* pool, BufferPool::ClientHandle* client, int64_t min_buffer_len)
+  : pool_(pool), client_(client), min_buffer_len_(min_buffer_len), allocated_(0) {}
+
+Suballocator::~Suballocator() {
+  // All allocations should be free and buffers deallocated.
+  DCHECK_EQ(allocated_, 0);
+  for (int i = 0; i < NUM_FREE_LISTS; ++i) {
+    DCHECK(free_lists_[i] == nullptr);
+  }
+}
+
+Status Suballocator::Allocate(int64_t bytes, unique_ptr<Suballocation>* result) {
+  DCHECK_GE(bytes, 0);
+  if (UNLIKELY(bytes > MAX_ALLOCATION_BYTES)) {
+    std::stringstream err_stream;
+    err_stream << "Requested memory allocation of "  << bytes
+               << " bytes, larger than max " << "supported of " << MAX_ALLOCATION_BYTES
+               << " bytes";
+    return Status(err_stream.str());
+  }
+  unique_ptr<Suballocation> free_node;
+  bytes = max(bytes, MIN_ALLOCATION_BYTES);
+  const int target_list_idx = ComputeListIndex(bytes);
+  // Search free lists from the target size upwards for the smallest usable node.
+  for (int i = target_list_idx; i < NUM_FREE_LISTS; ++i) {
+    free_node = PopFreeListHead(i);
+    if (free_node != nullptr) break;
+  }
+
+  if (free_node == nullptr) {
+    // Unable to find free allocation, need to get more memory from buffer pool.
+    RETURN_IF_ERROR(AllocateBuffer(bytes, &free_node));
+    if (free_node == nullptr) {
+      *result = nullptr;
+      return Status::OK;
+    }
+  }
+
+  // Free node may be larger than required.
+  const int free_list_idx = ComputeListIndex(free_node->len_);
+  if (free_list_idx != target_list_idx) {
+    RETURN_IF_ERROR(SplitToSize(move(free_node), bytes, &free_node));
+    DCHECK(free_node != nullptr);
+  }
+
+  free_node->in_use_ = true;
+  allocated_ += free_node->len_;
+  *result = move(free_node);
+  return Status::OK;
+}
+
+int Suballocator::ComputeListIndex(int64_t bytes) const {
+  return BitUtil::Log2CeilingNonZero64(bytes) - LOG_MIN_ALLOCATION_BYTES;
+}
+
+Status Suballocator::AllocateBuffer(int64_t bytes, unique_ptr<Suballocation>* result) {
+  DCHECK_LE(bytes, MAX_ALLOCATION_BYTES);
+  // Buffers are always at least 'min_buffer_len_' and a power of two.
+  const int64_t buffer_len = max(min_buffer_len_, BitUtil::RoundUpToPowerOfTwo(bytes));
+  if (!client_->IncreaseReservationToFit(buffer_len)) {
+    *result = nullptr;
+    return Status::OK;
+  }
+
+  unique_ptr<Suballocation> free_node;
+  RETURN_IF_ERROR(Suballocation::Create(&free_node));
+  RETURN_IF_ERROR(pool_->AllocateBuffer(client_, buffer_len, &free_node->buffer_));
+
+  free_node->data_ = free_node->buffer_.data();
+  free_node->len_ = buffer_len;
+  *result = move(free_node);
+  return Status::OK;
+}
+
+Status Suballocator::SplitToSize(unique_ptr<Suballocation> free_node,
+    int64_t target_bytes, unique_ptr<Suballocation>* result) {
+  DCHECK(!free_node->in_use_);
+  DCHECK_GT(free_node->len_, target_bytes);
+
+  const int free_list_idx = ComputeListIndex(free_node->len_);
+  const int target_list_idx = ComputeListIndex(target_bytes);
+
+  // Preallocate nodes to avoid handling allocation failures during splitting.
+  // Need two nodes per level for the left and right children.
+  const int num_nodes = (free_list_idx - target_list_idx) * 2;
+  constexpr int MAX_NUM_NODES = NUM_FREE_LISTS * 2;
+  unique_ptr<Suballocation> nodes[MAX_NUM_NODES];
+  for (int i = 0; i < num_nodes; ++i) {
+    if (!Suballocation::Create(&nodes[i]).ok()) {
+      // Add the free node to the free list to restore the allocator to an internally
+      // consistent state.
+      AddToFreeList(move(free_node));
+      return Status("Failed to allocate list node in Suballocator");
+    }
+  }
+
+  // Iteratively split from the current size down to the target size. We will return
+  // the leftmost descendant node.
+  int next_node = 0;
+  for (int i = free_list_idx; i > target_list_idx; --i) {
+    DCHECK_EQ(free_node->len_, 1LL << (i + LOG_MIN_ALLOCATION_BYTES));
+    unique_ptr<Suballocation> left_child = move(nodes[next_node++]);
+    unique_ptr<Suballocation> right_child = move(nodes[next_node++]);
+    DCHECK_LE(next_node, num_nodes);
+
+    const int64_t child_len = free_node->len_ / 2;
+    left_child->data_ = free_node->data_;
+    right_child->data_ = free_node->data_ + child_len;
+    left_child->len_ = right_child->len_ = child_len;
+    left_child->buddy_ = right_child.get();
+    right_child->buddy_ = left_child.get();
+    free_node->in_use_ = true;
+    left_child->parent_ = move(free_node);
+
+    AddToFreeList(move(right_child));
+    free_node = move(left_child);
+  }
+  *result = move(free_node);
+  return Status::OK;
+}
+
+void Suballocator::Free(unique_ptr<Suballocation> allocation) {
+  if (allocation == nullptr) return;
+
+  DCHECK(allocation->in_use_);
+  allocation->in_use_ = false;
+  allocated_ -= allocation->len_;
+
+  // Iteratively coalesce buddies until the buddy is in use or we get to the root.
+  // This ensures that all buddies in the free lists are coalesced. I.e. we do not
+  // have two buddies in the same free list.
+  unique_ptr<Suballocation> curr_allocation = move(allocation);
+  while (curr_allocation->buddy_ != nullptr) {
+    if (curr_allocation->buddy_->in_use_) {
+      // If the buddy is not free we can't coalesce, just add it to free list.
+      AddToFreeList(move(curr_allocation));
+      return;
+    }
+    unique_ptr<Suballocation> buddy = RemoveFromFreeList(curr_allocation->buddy_);
+    curr_allocation = CoalesceBuddies(move(curr_allocation), move(buddy));
+  }
+
+  // Reached root, which is an entire free buffer. We are not using it, so free up memory.
+  DCHECK(curr_allocation->buffer_.is_open());
+  pool_->FreeBuffer(client_, &curr_allocation->buffer_);
+  curr_allocation.reset();
+}
+
+void Suballocator::AddToFreeList(unique_ptr<Suballocation> node) {
+  DCHECK(!node->in_use_);
+  int list_idx = ComputeListIndex(node->len_);
+  // Push onto the head of the doubly-linked free list; ownership of the old head
+  // transfers to the new node's 'next_free_'.
+  if (free_lists_[list_idx] != nullptr) {
+    free_lists_[list_idx]->prev_free_ = node.get();
+  }
+  node->next_free_ = move(free_lists_[list_idx]);
+  DCHECK(node->prev_free_ == nullptr);
+  free_lists_[list_idx] = move(node);
+}
+
+unique_ptr<Suballocation> Suballocator::RemoveFromFreeList(Suballocation* node) {
+  DCHECK(node != nullptr);
+  const int list_idx = ComputeListIndex(node->len_);
+
+  if (node->next_free_ != nullptr) {
+    node->next_free_->prev_free_ = node->prev_free_;
+  }
+
+  // 'node' is owned either by the list head or by its predecessor's 'next_free_'.
+  unique_ptr<Suballocation>* ptr_from_prev = node->prev_free_ == nullptr ?
+      &free_lists_[list_idx] :
+      &node->prev_free_->next_free_;
+  node->prev_free_ = nullptr;
+  unique_ptr<Suballocation> result = move(*ptr_from_prev);
+  *ptr_from_prev = move(node->next_free_);
+  return result;
+}
+
+unique_ptr<Suballocation> Suballocator::PopFreeListHead(int list_idx) {
+  if (free_lists_[list_idx] == nullptr) return nullptr;
+  unique_ptr<Suballocation> result = move(free_lists_[list_idx]);
+  DCHECK(result->prev_free_ == nullptr);
+  if (result->next_free_ != nullptr) {
+    result->next_free_->prev_free_ = nullptr;
+  }
+  free_lists_[list_idx] = move(result->next_free_);
+  return result;
+}
+
+unique_ptr<Suballocation> Suballocator::CoalesceBuddies(
+    unique_ptr<Suballocation> b1, unique_ptr<Suballocation> b2) {
+  DCHECK(!b1->in_use_);
+  DCHECK(!b2->in_use_);
+  DCHECK_EQ(b1->buddy_, b2.get());
+  DCHECK_EQ(b2->buddy_, b1.get());
+  // Only the left child's parent should be present.
+  DCHECK((b1->parent_ != nullptr) ^ (b2->parent_ != nullptr));
+  unique_ptr<Suballocation> parent =
+      b1->parent_ != nullptr ? move(b1->parent_) : move(b2->parent_);
+  parent->in_use_ = false;
+  return parent;
+}
+
+Status Suballocation::Create(unique_ptr<Suballocation>* new_suballocation) {
+  // Allocate from system allocator for simplicity. We don't expect this to be
+  // performance critical or to be used for small allocations where CPU/memory
+  // overhead of these allocations might be a consideration.
+  new_suballocation->reset(new (nothrow) Suballocation());
+  if (*new_suballocation == nullptr) {
+    return Status(TStatusCode::MEM_ALLOC_FAILED);
+  }
+  return Status::OK;
+}
+}
diff --git a/be/src/runtime/bufferpool/suballocator.h b/be/src/runtime/bufferpool/suballocator.h
new file mode 100644
index 0000000000..50f785ec82
--- /dev/null
+++ b/be/src/runtime/bufferpool/suballocator.h
@@ -0,0 +1,224 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef IMPALA_RUNTIME_BUFFERPOOL_SUBALLOCATOR_H
+#define IMPALA_RUNTIME_BUFFERPOOL_SUBALLOCATOR_H
+
+#include <cstdint>
+#include <memory>
+
+#include "runtime/bufferpool/buffer_pool.h"
+
+namespace palo {
+
+class Suballocation;
+
+/// Helper class to subdivide buffers from the buffer pool. Implements a buddy
+/// allocation algorithm optimised for power-of-two allocations. At or above the
+/// 'min_buffer_len' value, each allocation is backed by a power-of-two buffer from
+/// a BufferPool. Below that threshold, each allocation is backed by a
+/// 'min_buffer_len' buffer split recursively into equal-sized buddies until the
+/// desired allocation size is reached. Every time an allocation is freed,
+/// free buddies are coalesced eagerly and whole buffers are freed eagerly.
+///
+/// The algorithms used are asymptotically efficient: O(log(max allocation size)), but
+/// the implementation's constant-factor overhead is not optimised. Thus, the allocator
+/// is best suited for relatively large allocations where the constant CPU/memory
+/// overhead per allocation is not paramount, e.g. bucket directories of hash tables.
+/// All allocations less than MIN_ALLOCATION_BYTES are rounded up to that amount.
+///
+/// Methods of Suballocator are not thread safe.
+///
+/// Implementation:
+/// ---------------
+/// The allocator uses two key data structures: a number of binary trees representing
+/// the buddy relationships between allocations and a set of free lists, one for each
+/// power-of-two size.
+///
+/// Each buffer allocated from the buffer pool has a tree of Suballocations associated
+/// with it that use the memory from that buffer. The root of the tree is the
+/// Suballocation corresponding to the entire buffer. Each node has either zero children
+/// (if it hasn't been split) or two children (if it has been split into two buddy
+/// allocations). Each non-root Suballocation has pointers to its buddy and its parent
+/// to enable coalescing the buddies into the parent when both are free.
+///
+/// Suballocations are eagerly coalesced when freed, so a Suballocation only has children
+/// if one of its descendants is allocated.
+///
+/// The free lists are doubly-linked lists of free Suballocation objects that support
+/// O(1) add and remove. The next and previous pointers are stored in the
+/// Suballocation object so no auxiliary memory is required.
+class Suballocator {
+ public:
+  /// Constructs a suballocator that allocates memory from 'pool' with 'client'.
+  /// Suballocations smaller than 'min_buffer_len' are handled by allocating a
+  /// buffer of 'min_buffer_len' and recursively splitting it.
+  Suballocator(
+      BufferPool* pool, BufferPool::ClientHandle* client, int64_t min_buffer_len);
+
+  ~Suballocator();
+
+  /// Allocate bytes from BufferPool. The allocation is nullptr if unsuccessful because
+  /// the client's reservation was insufficient. If an unexpected error is encountered,
+  /// returns that status. The allocation size is rounded up to the next power-of-two.
+  /// The caller must always free the allocation by calling Free() (otherwise destructing
+  /// the returned 'result' will DCHECK on debug builds or otherwise misbehave on release
+  /// builds).
+  ///
+  /// Allocate() will try to increase the client's buffer pool reservation to fulfill
+  /// the requested allocation if needed.
+  ///
+  /// The memory returned is at least 8-byte aligned.
+  Status Allocate(int64_t bytes, std::unique_ptr<Suballocation>* result);
+
+  /// Free the allocation. Does nothing if allocation is nullptr (e.g. was the result of a
+  /// failed Allocate() call).
+  void Free(std::unique_ptr<Suballocation> allocation);
+
+  /// Upper bounds on the max allocation size and the number of different
+  /// power-of-two allocation sizes. Used to bound the number of free lists.
+  static constexpr int LOG_MAX_ALLOCATION_BYTES = BufferPool::LOG_MAX_BUFFER_BYTES;
+  static constexpr int64_t MAX_ALLOCATION_BYTES = BufferPool::MAX_BUFFER_BYTES;
+
+  /// Don't support allocations less than 4kb to avoid high overhead.
+  static constexpr int LOG_MIN_ALLOCATION_BYTES = 12;
+  static constexpr int64_t MIN_ALLOCATION_BYTES = 1L << LOG_MIN_ALLOCATION_BYTES;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(Suballocator);
+
+  /// Compute the index for allocations of size 'bytes' in 'free_lists_'. 'bytes' is
+  /// rounded up to the next power-of-two if it is not already a power-of-two.
+  int ComputeListIndex(int64_t bytes) const;
+
+  /// Allocate a buffer of size 'bytes' < MAX_ALLOCATION_BYTES from the buffer pool and
+  /// initialize 'result' with it. If the reservation is insufficient, try to increase
+  /// the reservation to fit.
+  Status AllocateBuffer(int64_t bytes, std::unique_ptr<Suballocation>* result);
+
+  /// Split the free allocation until we get an allocation of 'target_bytes' rounded up
+  /// to a power-of-two. This allocation is returned. The other allocations resulting
+  /// from the splits are added to free lists. node->in_use must be false and 'node'
+  /// must not be in any free list. Can fail if allocating memory for data structures
+  /// fails.
+  Status SplitToSize(std::unique_ptr<Suballocation> node, int64_t target_bytes,
+      std::unique_ptr<Suballocation>* result);
+
+  // Add allocation to the free list with given index.
+  void AddToFreeList(std::unique_ptr<Suballocation> node);
+
+  // Remove allocation from its free list.
+  std::unique_ptr<Suballocation> RemoveFromFreeList(Suballocation* node);
+
+  // Get the allocation at the head of the free list at index 'list_idx'. Return nullptr
+  // if list is empty.
+  std::unique_ptr<Suballocation> PopFreeListHead(int list_idx);
+
+  /// Coalesce two free buddies, 'b1' and 'b2'. Frees 'b1' and 'b2' and marks the parent
+  /// not in use.
+  std::unique_ptr<Suballocation> CoalesceBuddies(
+      std::unique_ptr<Suballocation> b1, std::unique_ptr<Suballocation> b2);
+
+  /// The pool and corresponding client to allocate buffers from.
+  BufferPool* pool_;
+  BufferPool::ClientHandle* client_;
+
+  /// The minimum length of buffer to allocate. To serve allocations below this threshold,
+  /// a larger buffer is allocated and split into multiple allocations.
+  const int64_t min_buffer_len_;
+
+  /// Track how much memory has been returned in allocations but not freed.
+  int64_t allocated_;
+
+  /// Free lists for each supported power-of-two size. Statically allocate the maximum
+  /// possible number of lists for simplicity. Indexed by log2 of the allocation size
+  /// minus log2 of the minimum allocation size, e.g. 16k allocations are at index 2.
+  /// Each free list should only include one buddy of each pair: if both buddies are
+  /// free, they should have been coalesced.
+  ///
+  /// Each free list is implemented as a doubly-linked list.
+  static constexpr int NUM_FREE_LISTS =
+      LOG_MAX_ALLOCATION_BYTES - LOG_MIN_ALLOCATION_BYTES + 1;
+  std::unique_ptr<Suballocation> free_lists_[NUM_FREE_LISTS];
+};
+
+/// An allocation made by a Suballocator. Each allocation returned by Suballocator must
+/// be freed with Suballocator::Free().
+///
+/// Unique_ptr is used to manage ownership of these Suballocations as a guard against
+/// memory leaks. The owner of the unique_ptr is either:
+/// - client code, if the suballocation is in use
+/// - the free list array, if the suballocation is the head of a free list
+/// - the previous free list entry, if the suballocation is a subsequent free list entry
+/// - the suballocation's left child, if the suballocation is split
+class Suballocation {
+ public:
+  // Checks that the allocation is not in use (therefore not leaked).
+  ~Suballocation() { DCHECK(!in_use_); }
+
+  uint8_t* data() const { return data_; }
+  int64_t len() const { return len_; }
+
+ private:
+  friend class Suballocator;
+
+  DISALLOW_COPY_AND_ASSIGN(Suballocation);
+
+  /// Static constructor for Suballocation. Can fail if new fails to allocate memory.
+  static Status Create(std::unique_ptr<Suballocation>* new_suballocation);
+
+  // The actual constructor - Create() is used for its better error handling.
+  Suballocation()
+    : data_(nullptr), len_(-1), buddy_(nullptr), prev_free_(nullptr), in_use_(false) {}
+
+  /// The allocation's data and its length.
+  uint8_t* data_;
+  int64_t len_;
+
+  /// The buffer backing the Suballocation, if the Suballocation is backed by an entire
+  /// buffer. Otherwise uninitialized. 'buffer_' is open iff 'buddy_' is nullptr.
+  BufferPool::BufferHandle buffer_;
+
+  /// If this is a left child, the parent of this and its buddy. The parent's allocation
+  /// is the contiguous memory buffer comprised of the two allocations. We store the
+  /// parent in only the left child so that it is uniquely owned.
+  std::unique_ptr<Suballocation> parent_;
+
+  /// The buddy allocation of this allocation. The buddy's memory buffer is the same
+  /// size and adjacent in memory. Two buddy Suballocation objects have the same
+  /// lifetime: they are created in SplitToSize() and destroyed in CoalesceBuddies().
+  Suballocation* buddy_;
+
+  /// If this is in a free list, the next element in the list. nullptr if this is the last
+  /// element in the free list. This pointer owns the next element in the linked list,
+  /// which itself stores a raw back-pointer.
+  std::unique_ptr<Suballocation> next_free_;
+
+  /// If this is in a free list, the previous element in the list. nullptr if this is the
+  /// first element. If non-nullptr, this Suballocation is owned by 'prev_free_'.
+  Suballocation* prev_free_;
+
+  /// True if was returned from Allocate() and hasn't been freed yet, or if it has been
+  /// split into two child Suballocations.
+  bool in_use_;
+};
+}
+
+#endif
diff --git a/be/src/runtime/bufferpool/system_allocator.cc b/be/src/runtime/bufferpool/system_allocator.cc
new file mode 100644
index 0000000000..3af208ee60
--- /dev/null
+++ b/be/src/runtime/bufferpool/system_allocator.cc
@@ -0,0 +1,175 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "runtime/bufferpool/system_allocator.h"
+
+#include 
+
+#include 
+
+#include "gutil/strings/substitute.h"
+#include "util/bit_util.h"
+
+#include "common/names.h"
+#include "common/config.h"
+#include "util/error_util.h"
+
+// TODO: IMPALA-5073: this should eventually become the default once we are confident
+// that it is superior to allocating via TCMalloc.
+//DEFINE_bool(mmap_buffers, false,
+//    "(Experimental) If true, allocate buffers directly from the operating system "
+//    "instead of with TCMalloc.");
+
+//DEFINE_bool(madvise_huge_pages, true,
+//    "(Advanced) If true, advise operating system to back large memory buffers with huge "
+//    "pages");
+
+namespace palo {
+
+/// These are the page sizes on x86-64. We could parse /proc/meminfo to programmatically
+/// get this, but it is unlikely to change unless we port to a different architecture.
+static int64_t SMALL_PAGE_SIZE = 4LL * 1024;
+static int64_t HUGE_PAGE_SIZE = 2LL * 1024 * 1024;
+
+SystemAllocator::SystemAllocator(int64_t min_buffer_len)
+  : min_buffer_len_(min_buffer_len) {
+  DCHECK(BitUtil::IsPowerOf2(min_buffer_len));
+#if !defined(ADDRESS_SANITIZER) && !defined(THREAD_SANITIZER) && !defined(LEAK_SANITIZER)
+  // Free() assumes that aggressive decommit is enabled for TCMalloc.
+  size_t aggressive_decommit_enabled;
+  MallocExtension::instance()->GetNumericProperty(
+      "tcmalloc.aggressive_memory_decommit", &aggressive_decommit_enabled);
+  CHECK_EQ(true, aggressive_decommit_enabled);
+#endif
+}
+
+Status SystemAllocator::Allocate(int64_t len, BufferPool::BufferHandle* buffer) {
+  DCHECK_GE(len, min_buffer_len_);
+  DCHECK_LE(len, BufferPool::MAX_BUFFER_BYTES);
+  DCHECK(BitUtil::IsPowerOf2(len)) << len;
+
+  uint8_t* buffer_mem;
+  if (config::FLAGS_mmap_buffers) {
+    RETURN_IF_ERROR(AllocateViaMMap(len, &buffer_mem));
+  } else {
+    RETURN_IF_ERROR(AllocateViaMalloc(len, &buffer_mem));
+  }
+  buffer->Open(buffer_mem, len, CpuInfo::get_current_core());
+  return Status::OK;
+}
+
+Status SystemAllocator::AllocateViaMMap(int64_t len, uint8_t** buffer_mem) {
+  int64_t map_len = len;
+  bool use_huge_pages = len % HUGE_PAGE_SIZE == 0 && config::FLAGS_madvise_huge_pages;
+  if (use_huge_pages) {
+    // Map an extra huge page so we can fix up the alignment if needed.
+    map_len += HUGE_PAGE_SIZE;
+  }
+  uint8_t* mem = reinterpret_cast(
+      mmap(nullptr, map_len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0));
+  if (mem == MAP_FAILED) {
+    return Status(TStatusCode::BUFFER_ALLOCATION_FAILED);
+  }
+
+  if (use_huge_pages) {
+    // mmap() may return memory that is not aligned to the huge page size. For the
+    // subsequent madvise() call to work well, we need to align it ourselves and
+    // unmap the memory on either side of the buffer that we don't need.
+    uintptr_t misalignment = reinterpret_cast(mem) % HUGE_PAGE_SIZE;
+    if (misalignment != 0) {
+      uintptr_t fixup = HUGE_PAGE_SIZE - misalignment;
+      munmap(mem, fixup);
+      mem += fixup;
+      map_len -= fixup;
+    }
+    munmap(mem + len, map_len - len);
+    DCHECK_EQ(reinterpret_cast(mem) % HUGE_PAGE_SIZE, 0) << mem;
+    // Mark the buffer as a candidate for promotion to huge pages. The Linux Transparent
+    // Huge Pages implementation will try to back the memory with a huge page if it is
+    // enabled. MADV_HUGEPAGE was introduced in 2.6.38, so we similarly need to skip this
+    // code if we are compiling against an older kernel.
+#ifdef MADV_HUGEPAGE
+    int rc;
+    // According to madvise() docs it may return EAGAIN to signal that we should retry.
+    do {
+      rc = madvise(mem, len, MADV_HUGEPAGE);
+    } while (rc == -1 && errno == EAGAIN);
+    DCHECK(rc == 0) << "madvise(MADV_HUGEPAGE) shouldn't fail" << errno;
+#endif
+  }
+  *buffer_mem = mem;
+  return Status::OK;
+}
+
+Status SystemAllocator::AllocateViaMalloc(int64_t len, uint8_t** buffer_mem) {
+  bool use_huge_pages = len % HUGE_PAGE_SIZE == 0 && config::FLAGS_madvise_huge_pages;
+  // Allocate, aligned to the page size that we expect to back the memory range.
+  // This ensures that it can be backed by a whole pages, rather than parts of pages.
+  size_t alignment = use_huge_pages ? HUGE_PAGE_SIZE : SMALL_PAGE_SIZE;
+  int rc = posix_memalign(reinterpret_cast(buffer_mem), alignment, len);
+#if !defined(ADDRESS_SANITIZER) && !defined(LEAK_SANITIZER)
+  // Workaround ASAN bug where posix_memalign returns 0 even when allocation fails.
+  // It should instead return ENOMEM. See https://bugs.llvm.org/show_bug.cgi?id=32968.
+  if (rc == 0 && *buffer_mem == nullptr && len != 0) rc = ENOMEM;
+#endif
+  if (rc != 0) {
+    std::stringstream ss;
+    ss << "posix_memalign() failed to allocate buffer: " << get_str_err_msg();
+    return Status(ss.str());
+  }
+  if (use_huge_pages) {
+#ifdef MADV_HUGEPAGE
+    // According to madvise() docs it may return EAGAIN to signal that we should retry.
+    do {
+      rc = madvise(*buffer_mem, len, MADV_HUGEPAGE);
+    } while (rc == -1 && errno == EAGAIN);
+    DCHECK(rc == 0) << "madvise(MADV_HUGEPAGE) shouldn't fail" << errno;
+#endif
+  }
+  return Status::OK;
+}
+
+void SystemAllocator::Free(BufferPool::BufferHandle&& buffer) {
+  if (config::FLAGS_mmap_buffers) {
+    int rc = munmap(buffer.data(), buffer.len());
+    DCHECK_EQ(rc, 0) << "Unexpected munmap() error: " << errno;
+  } else {
+    bool use_huge_pages = buffer.len() % HUGE_PAGE_SIZE == 0 && config::FLAGS_madvise_huge_pages;
+    if (use_huge_pages) {
+      // Undo the madvise so that is isn't a candidate to be newly backed by huge pages.
+      // We depend on TCMalloc's "aggressive decommit" mode decommitting the physical
+      // huge pages with madvise(DONTNEED) when we call free(). Otherwise, this huge
+      // page region may be divvied up and subsequently decommitted in smaller chunks,
+      // which may not actually release the physical memory, causing Impala physical
+      // memory usage to exceed the process limit.
+#ifdef MADV_NOHUGEPAGE
+      // According to madvise() docs it may return EAGAIN to signal that we should retry.
+      int rc;
+      do {
+        rc = madvise(buffer.data(), buffer.len(), MADV_NOHUGEPAGE);
+      } while (rc == -1 && errno == EAGAIN);
+      DCHECK(rc == 0) << "madvise(MADV_NOHUGEPAGE) shouldn't fail" << errno;
+#endif
+    }
+    free(buffer.data());
+  }
+  buffer.Reset(); // Avoid DCHECK in ~BufferHandle().
+}
+}
diff --git a/be/src/runtime/bufferpool/system_allocator.h b/be/src/runtime/bufferpool/system_allocator.h
new file mode 100644
index 0000000000..1d9715e25f
--- /dev/null
+++ b/be/src/runtime/bufferpool/system_allocator.h
@@ -0,0 +1,56 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef IMPALA_RUNTIME_SYSTEM_ALLOCATOR_H
+#define IMPALA_RUNTIME_SYSTEM_ALLOCATOR_H
+
+#include "common/status.h"
+
+#include "runtime/bufferpool/buffer_pool.h"
+
+namespace palo {
+
+/// The underlying memory allocator for the buffer pool that allocates buffer memory from
+/// the operating system using mmap(). All buffers are allocated through the BufferPool's
+/// SystemAllocator. The allocator only handles allocating buffers that are power-of-two
+/// multiples of the minimum buffer length.
+class SystemAllocator {
+ public:
+  SystemAllocator(int64_t min_buffer_len);
+
+  /// Allocate memory for a buffer of 'len' bytes. 'len' must be a power-of-two multiple
+  /// of the minimum buffer length.
+  Status Allocate(int64_t len, BufferPool::BufferHandle* buffer) WARN_UNUSED_RESULT;
+
+  /// Free the memory for a previously-allocated buffer.
+  void Free(BufferPool::BufferHandle&& buffer);
+
+ private:
+  /// Allocate 'len' bytes of memory for a buffer via mmap().
+  Status AllocateViaMMap(int64_t len, uint8_t** buffer_mem);
+
+  /// Allocate 'len' bytes of memory for a buffer via our malloc implementation.
+  Status AllocateViaMalloc(int64_t len, uint8_t** buffer_mem);
+
+  const int64_t min_buffer_len_;
+};
+}
+
+#endif
diff --git a/be/src/runtime/client_cache.cpp b/be/src/runtime/client_cache.cpp
index ed37112991..b183ef1eab 100644
--- a/be/src/runtime/client_cache.cpp
+++ b/be/src/runtime/client_cache.cpp
@@ -70,7 +70,7 @@ Status ClientCacheHelper::get_client(
     _client_map[*client_key]->set_recv_timeout(timeout_ms);
 
     if (_metrics_enabled) {
-        _clients_in_use_metric->increment(1);
+        _used_clients->increment(1);
     }
 
     return Status::OK;
@@ -98,7 +98,7 @@ Status ClientCacheHelper::reopen_client(client_factory factory_method, void** cl
     *client_key = NULL;
 
     if (_metrics_enabled) {
-        _total_clients_metric->increment(-1);
+        _opened_clients->increment(-1);
     }
 
     RETURN_IF_ERROR(create_client(make_network_address(
@@ -129,7 +129,7 @@ Status ClientCacheHelper::create_client(
     _client_map[*client_key] = client_impl.release();
 
     if (_metrics_enabled) {
-        _total_clients_metric->increment(1);
+        _opened_clients->increment(1);
     }
 
     return Status::OK;
@@ -149,7 +149,7 @@ void ClientCacheHelper::release_client(void** client_key) {
     j->second.push_back(*client_key);
 
     if (_metrics_enabled) {
-        _clients_in_use_metric->increment(-1);
+        _used_clients->increment(-1);
     }
 
     *client_key = NULL;
@@ -204,19 +204,21 @@ void ClientCacheHelper::test_shutdown() {
     }
 }
 
-void ClientCacheHelper::init_metrics(MetricGroup* metrics, const std::string& key_prefix) {
+void ClientCacheHelper::init_metrics(MetricRegistry* metrics, const std::string& key_prefix) {
     DCHECK(metrics != NULL);
     // Not strictly needed if init_metrics is called before any cache
     // usage, but ensures that _metrics_enabled is published.
     boost::lock_guard lock(_lock);
-    std::stringstream count_ss;
-    count_ss << key_prefix << ".client_cache.clients_in_use";
-    _clients_in_use_metric =
-        metrics->AddGauge(count_ss.str(), 0L);
 
-    std::stringstream max_ss;
-    max_ss << key_prefix << ".client_cache.total_clients";
-    _total_clients_metric = metrics->AddGauge(max_ss.str(), 0L);
+    _used_clients.reset(new IntGauge());
+    metrics->register_metric("thrift_used_clients",
+                             MetricLabels().add("name", key_prefix),
+                             _used_clients.get());
+
+    _opened_clients.reset(new IntGauge());
+    metrics->register_metric("thrift_opened_clients",
+                             MetricLabels().add("name", key_prefix),
+                             _opened_clients.get());
     _metrics_enabled = true;
 }
 
diff --git a/be/src/runtime/client_cache.h b/be/src/runtime/client_cache.h
index 0d544fa18d..b2650b0154 100644
--- a/be/src/runtime/client_cache.h
+++ b/be/src/runtime/client_cache.h
@@ -87,7 +87,7 @@ public:
 
     void test_shutdown();
 
-    void init_metrics(MetricGroup* metrics, const std::string& key_prefix);
+    void init_metrics(MetricRegistry* metrics, const std::string& key_prefix);
 
 private:
     template  friend class ClientCache;
@@ -108,14 +108,14 @@ private:
     typedef boost::unordered_map ClientMap;
     ClientMap _client_map;
 
-    // MetricGroup
+    // MetricRegistry
     bool _metrics_enabled;
 
     // Number of clients 'checked-out' from the cache
-    IntGauge* _clients_in_use_metric;
+    std::unique_ptr _used_clients;
 
     // Total clients in the cache, including those in use
-    IntGauge* _total_clients_metric;
+    std::unique_ptr _opened_clients;
 
     // Create a new client for specific host/port in 'client' and put it in _client_map
     Status create_client(const TNetworkAddress& hostport, client_factory factory_method,
@@ -216,11 +216,11 @@ public:
         return _client_cache_helper.test_shutdown();
     }
 
-    // Adds metrics for this cache to the supplied MetricGroup instance. The
+    // Adds metrics for this cache to the supplied MetricRegistry instance. The
     // metrics have keys that are prefixed by the key_prefix argument
     // (which should not end in a period).
     // Must be called before the cache is used, otherwise the metrics might be wrong
-    void init_metrics(MetricGroup* metrics, const std::string& key_prefix) {
+    void init_metrics(MetricRegistry* metrics, const std::string& key_prefix) {
         _client_cache_helper.init_metrics(metrics, key_prefix);
     }
 
diff --git a/be/src/runtime/data_spliter.cpp b/be/src/runtime/data_spliter.cpp
index bed04f85d2..c0622b7e83 100644
--- a/be/src/runtime/data_spliter.cpp
+++ b/be/src/runtime/data_spliter.cpp
@@ -321,7 +321,8 @@ Status DataSpliter::close(RuntimeState* state, Status close_status) {
             err_status = status;
         }
     }
-
+  
+    _expr_mem_tracker->close();
     _closed = true;
     if (is_ok) {
         return Status::OK;
diff --git a/be/src/runtime/data_stream_mgr.cpp b/be/src/runtime/data_stream_mgr.cpp
index d93230bfb7..a0a3aa738a 100644
--- a/be/src/runtime/data_stream_mgr.cpp
+++ b/be/src/runtime/data_stream_mgr.cpp
@@ -34,6 +34,7 @@
 #include "rpc/comm.h"
 #include "rpc/comm_buf.h"
 
+#include "gen_cpp/types.pb.h" // PUniqueId
 #include "gen_cpp/BackendService.h"
 #include "gen_cpp/PaloInternalService_types.h"
 
@@ -120,6 +121,31 @@ Status DataStreamMgr::add_data(
     return Status::OK;
 }
 
+Status DataStreamMgr::add_data(
+        const PUniqueId& finst_id, int32_t node_id,
+        const PRowBatch& pb_batch, int32_t sender_id,
+        int be_number, int64_t packet_seq,
+        ::google::protobuf::Closure** done) {
+    VLOG_ROW << "add_data(): finst_id=" << print_id(finst_id)
+            << " node=" << node_id;
+    TUniqueId t_finst_id;
+    t_finst_id.hi = finst_id.hi();
+    t_finst_id.lo = finst_id.lo();
+    shared_ptr recvr = find_recvr(t_finst_id, node_id);
+    if (recvr == NULL) {
+        // The receiver may remove itself from the receiver map via deregister_recvr()
+        // at any time without considering the remaining number of senders.
+        // As a consequence, find_recvr() may return an innocuous NULL if a thread
+        // calling deregister_recvr() beat the thread calling find_recvr()
+        // in acquiring _lock.
+        // TODO: Rethink the lifecycle of DataStreamRecvr to distinguish
+        // errors from receiver-initiated teardowns.
+        return Status::OK;
+    }
+    recvr->add_batch(pb_batch, sender_id, be_number, packet_seq, done);
+    return Status::OK;
+}
+
 Status DataStreamMgr::close_sender(const TUniqueId& fragment_instance_id,
                                    PlanNodeId dest_node_id,
                                    int sender_id, 
diff --git a/be/src/runtime/data_stream_mgr.h b/be/src/runtime/data_stream_mgr.h
index bfc29eace9..d1ef74526c 100644
--- a/be/src/runtime/data_stream_mgr.h
+++ b/be/src/runtime/data_stream_mgr.h
@@ -38,6 +38,12 @@
 
 #include "rpc/inet_addr.h"
 
+namespace google {
+namespace protobuf {
+class Closure;
+}
+}
+
 namespace palo {
 
 class DescriptorTbl;
@@ -47,7 +53,9 @@ class RuntimeState;
 class TRowBatch;
 class Comm;
 class CommBuf;
+class PRowBatch;
 typedef std::shared_ptr CommBufPtr;
+class PUniqueId;
 
 // Singleton class which manages all incoming data streams at a backend node. It
 // provides both producer and consumer functionality for each data stream.
@@ -91,9 +99,11 @@ public:
     Status add_data(const TUniqueId& fragment_instance_id, PlanNodeId dest_node_id,
             const TRowBatch& thrift_batch, int sender_id, bool* buffer_overflow,
                     std::pair response);
-    // Status add_data(const TUniqueId& fragment_instance_id, PlanNodeId dest_node_id,
-    //                 const TRowBatch& thrift_batch, bool* buffer_overflow,
-    //                 std::pair response);
+
+    Status add_data(const PUniqueId& fragment_instance_id, int32_t node_id,
+                    const PRowBatch& pb_batch, int32_t sender_id,
+                    int32_t be_number, int64_t packet_seq,
+                    ::google::protobuf::Closure** done);
 
     // Notifies the recvr associated with the fragment/node id that the specified
     // sender has closed.
diff --git a/be/src/runtime/data_stream_recvr.cc b/be/src/runtime/data_stream_recvr.cc
index 0dc868ba9e..0b7345f864 100644
--- a/be/src/runtime/data_stream_recvr.cc
+++ b/be/src/runtime/data_stream_recvr.cc
@@ -22,10 +22,14 @@
 
 #include 
 #include 
+#include 
 
 #include 
 #include 
+#include 
 
+#include "gen_cpp/data.pb.h"
+#include "rpc/comm.h"
 #include "runtime/data_stream_mgr.h"
 #include "runtime/row_batch.h"
 #include "runtime/sorted_run_merger.h"
@@ -75,6 +79,11 @@ public:
             bool* is_buf_overflow,
             std::pair response);
 
+    void add_batch(
+        const PRowBatch& pb_batch,
+        int be_number, int64_t packet_seq,
+        ::google::protobuf::Closure** done);
+
     // Decrement the number of remaining senders for this queue and signal eos ("new data")
     // if the count drops to 0. The number of senders will be 1 for a merging
     // DataStreamRecvr.
@@ -128,9 +137,10 @@ private:
     std::unordered_set _sender_eos_set; // sender_id
     std::unordered_map _packet_seq_map; // be_number => packet_seq
 
-    boost::mutex _response_lock;
     typedef std::list> ResponseQueue;
     ResponseQueue _response_queue;
+
+    std::deque _pending_closures;
 };
 
 DataStreamRecvr::SenderQueue::SenderQueue(
@@ -179,7 +189,6 @@ Status DataStreamRecvr::SenderQueue::get_batch(RowBatch** next_batch) {
     *next_batch = _current_batch.get();
 
     {
-        boost::unique_lock response_lock(_response_lock);
         Comm* comm = Comm::instance();
         if (!_response_queue.empty()) {
             std::pair response = _response_queue.front();
@@ -187,6 +196,11 @@ Status DataStreamRecvr::SenderQueue::get_batch(RowBatch** next_batch) {
             _response_queue.pop_front();
         }
     }
+    if (!_pending_closures.empty()) {
+        auto done = _pending_closures.front();
+        done->Run();
+        _pending_closures.pop_front();
+    }
 
     return Status::OK;
 }
@@ -253,9 +267,8 @@ void DataStreamRecvr::SenderQueue::add_batch(const TRowBatch& thrift_batch,
         << " batch_size=" << batch_size << "\n";
     _batch_queue.push_back(make_pair(batch_size, batch));
 
-    if (_batch_queue.empty() || _recvr->exceeds_limit(batch_size)) {
+    if (_recvr->exceeds_limit(batch_size)) {
         *is_buf_overflow = true;
-        boost::unique_lock response_lock(_response_lock);
         _response_queue.push_back(response);
     }
 
@@ -263,19 +276,84 @@ void DataStreamRecvr::SenderQueue::add_batch(const TRowBatch& thrift_batch,
     _data_arrival_cv.notify_one();
 }
 
+void DataStreamRecvr::SenderQueue::add_batch(
+        const PRowBatch& pb_batch,
+        int be_number, int64_t packet_seq,
+        ::google::protobuf::Closure** done) {
+    unique_lock l(_lock);
+    if (_is_cancelled) {
+        return;
+    }
+    auto iter = _packet_seq_map.find(be_number);
+    if (iter != _packet_seq_map.end()) {
+        if (iter->second >= packet_seq) {
+            LOG(WARNING) << "packet already exist [cur_packet_id= " << iter->second
+                         << " receive_packet_id=" << packet_seq << "]";
+            return;
+        }
+        iter->second = packet_seq;
+    } else {
+        _packet_seq_map.emplace(be_number, packet_seq);
+    }
+
+    int batch_size = RowBatch::get_batch_size(pb_batch);
+    COUNTER_UPDATE(_recvr->_bytes_received_counter, batch_size);
+
+    // Following situation will match the following condition.
+    // Sender send a packet failed, then close the channel.
+    // but closed packet reach first, then the failed packet.
+    // Then meet the assert
+    // we remove the assert
+    // DCHECK_GT(_num_remaining_senders, 0);
+    if (_num_remaining_senders <= 0) {
+        DCHECK(_sender_eos_set.end() != _sender_eos_set.find(be_number));
+        return;
+    }
+
+    // We always accept the batch regardless of buffer limit, to avoid rpc pipeline stall.
+    // If exceed buffer limit, we just do not respoinse ACK to client, so the client won't
+    // send data until receive ACK.
+    // Note that if this be needs to receive data from N BEs, the size of buffer
+    // may reach as many as (buffer_size + n * buffer_size)
+    //
+    // Note: It's important that we enqueue thrift_batch regardless of buffer limit if
+    //  the queue is currently empty. In the case of a merging receiver, batches are
+    //  received from a specific queue based on data order, and the pipeline will stall
+    //  if the merger is waiting for data from an empty queue that cannot be filled
+    //  because the limit has been reached.
+    if (_is_cancelled) {
+        return;
+    }
+
+    RowBatch* batch = NULL;
+    {
+        SCOPED_TIMER(_recvr->_deserialize_row_batch_timer);
+        // Note: if this function makes a row batch, the batch *must* be added
+        // to _batch_queue. It is not valid to create the row batch and destroy
+        // it in this thread.
+        batch = new RowBatch(_recvr->row_desc(), pb_batch, _recvr->mem_tracker());
+    }
+    VLOG_ROW << "added #rows=" << batch->num_rows()
+        << " batch_size=" << batch_size << "\n";
+    _batch_queue.emplace_back(batch_size, batch);
+    // if done is nullptr, this function can't delay this response
+    if (done != nullptr && _recvr->exceeds_limit(batch_size)) {
+        DCHECK(*done != nullptr);
+        _pending_closures.push_back(*done);
+        *done = nullptr;
+    }
+    _recvr->_num_buffered_bytes += batch_size;
+    _data_arrival_cv.notify_one();
+}
+
 void DataStreamRecvr::SenderQueue::decrement_senders(int be_number) {
     lock_guard l(_lock);
-
     if (_sender_eos_set.end() != _sender_eos_set.find(be_number)) {
-        // already closed
         return;
     }
     _sender_eos_set.insert(be_number);
-
-    DCHECK_GT(_num_remaining_senders, 0) << " trace: " << std::endl << get_stack_trace();
-    _num_remaining_senders = std::max(0, _num_remaining_senders - 1);
-
-    VLOG_FILE << _recvr->fragment_instance_id();
+    DCHECK_GT(_num_remaining_senders, 0);
+    _num_remaining_senders--;
     VLOG_FILE << "decremented senders: fragment_instance_id="
         << _recvr->fragment_instance_id()
         << " node_id=" << _recvr->dest_node_id()
@@ -304,13 +382,18 @@ void DataStreamRecvr::SenderQueue::cancel() {
     //         _recvr->_bytes_received_time_series_counter);
 
     {
-        boost::lock_guard response_lock(_response_lock);
+        boost::lock_guard l(_lock);
         Comm* comm = Comm::instance();
         while (!_response_queue.empty()) {
             std::pair response = _response_queue.front();
             comm->send_response(response.first, response.second);
             _response_queue.pop_front();
         }
+
+        for (auto done : _pending_closures) {
+            done->Run();
+        }
+        _pending_closures.clear();
     }
 }
 
@@ -406,6 +489,15 @@ void DataStreamRecvr::add_batch(
     _sender_queues[use_sender_id]->add_batch(thrift_batch, is_buf_overflow, response);
 }
 
+void DataStreamRecvr::add_batch(
+        const PRowBatch& batch, int sender_id,
+        int be_number, int64_t packet_seq,
+        ::google::protobuf::Closure** done) {
+    int use_sender_id = _is_merging ? sender_id : 0;
+    // Add all batches to the same queue if _is_merging is false.
+    _sender_queues[use_sender_id]->add_batch(batch, be_number, packet_seq, done);
+}
+
 void DataStreamRecvr::remove_sender(int sender_id, int be_number) {
     int use_sender_id = _is_merging ? sender_id : 0;
     _sender_queues[use_sender_id]->decrement_senders(be_number);
@@ -426,6 +518,7 @@ void DataStreamRecvr::close() {
     _mgr->deregister_recvr(fragment_instance_id(), dest_node_id());
     _mgr = NULL;
     _merger.reset();
+    _mem_tracker->close();
     _mem_tracker->unregister_from_parent();
     _mem_tracker.reset();
 }
diff --git a/be/src/runtime/data_stream_recvr.h b/be/src/runtime/data_stream_recvr.h
index db00789f3f..2a5a7aa7e9 100644
--- a/be/src/runtime/data_stream_recvr.h
+++ b/be/src/runtime/data_stream_recvr.h
@@ -32,6 +32,12 @@
 #include "util/tuple_row_compare.h"
 #include "rpc/inet_addr.h"
 
+namespace google {
+namespace protobuf {
+class Closure;
+}
+}
+
 namespace palo {
 
 class DataStreamMgr;
@@ -39,6 +45,7 @@ class SortedRunMerger;
 class MemTracker;
 class RowBatch;
 class RuntimeProfile;
+class PRowBatch;
 
 class Comm;
 class CommBuf;
@@ -115,6 +122,11 @@ private:
     void add_batch(const TRowBatch& thrift_batch, int sender_id,
                    bool* is_buf_overflow, std::pair response);
 
+    // If receive queue is full, done is enqueue pending, and return with *done is nullptr
+    void add_batch(const PRowBatch& batch, int sender_id,
+                   int be_number, int64_t packet_seq,
+                   ::google::protobuf::Closure** done);
+
     // Indicate that a particular sender is done. Delegated to the appropriate
     // sender queue. Called from DataStreamMgr.
     void remove_sender(int sender_id, int be_number);
diff --git a/be/src/runtime/data_stream_sender.cpp b/be/src/runtime/data_stream_sender.cpp
index 39071dbcfc..539e729d42 100644
--- a/be/src/runtime/data_stream_sender.cpp
+++ b/be/src/runtime/data_stream_sender.cpp
@@ -28,6 +28,7 @@
 #include "common/logging.h"
 #include "exprs/expr.h"
 #include "runtime/descriptors.h"
+#include "runtime/exec_env.h"
 #include "runtime/tuple_row.h"
 #include "runtime/row_batch.h"
 #include "runtime/raw_value.h"
@@ -43,7 +44,9 @@
 #include "gen_cpp/Types_types.h"
 #include "gen_cpp/PaloInternalService_types.h"
 #include "gen_cpp/BackendService.h"
+#include "gen_cpp/internal_service.pb.h"
 
+#include "rpc/connection_manager.h"
 #include "rpc/dispatch_handler_synchronizer.h"
 #include "rpc/event.h"
 #include "rpc/protocol.h"
@@ -51,9 +54,35 @@
 #include "rpc/serialization.h"
 #include 
 
+#include "service/brpc.h"
+
 #include "util/thrift_util.h"
+#include "util/rpc_channel.h"
+#include "util/brpc_stub_cache.h"
 
 namespace palo {
+ 
+class TransmitDataClosure : public google::protobuf::Closure {
+public:
+    TransmitDataClosure() : _refs(0) { }
+    ~TransmitDataClosure() { }
+
+    void ref() { _refs.fetch_add(1, std::memory_order_relaxed); }
+
+    // If unref() returns true, this object should be delete
+    bool unref() { return _refs.fetch_sub(1, std::memory_order_relaxed) == 1; }
+
+    void Run() override {
+        if (unref()) {
+            delete this;
+        }
+    }
+
+    brpc::Controller cntl;
+    PTransmitDataResult result;
+private:
+    std::atomic _refs;
+};
 
 // A channel sends data asynchronously via calls to transmit_data
 // to a single destination ipaddress/node.
@@ -63,14 +92,16 @@ namespace palo {
 // at any one time (ie, sending will block if the most recent rpc hasn't finished,
 // which allows the receiver node to throttle the sender by withholding acks).
 // *Not* thread-safe.
-class DataStreamSender::Channel : public DispatchHandler {
+class DataStreamSender::Channel {
 public:
     // Create channel to send data to particular ipaddress/port/query/node
     // combination. buffer_size is specified in bytes and a soft limit on
     // how much tuple data is getting accumulated before being sent; it only applies
     // when data is added via add_row() and not sent directly via send_batch().
     Channel(DataStreamSender* parent, const RowDescriptor& row_desc,
-            const TNetworkAddress& destination, const TUniqueId& fragment_instance_id,
+            const TNetworkAddress& destination, 
+            const TNetworkAddress& brpc_dest,
+            const TUniqueId& fragment_instance_id,
             PlanNodeId dest_node_id, int buffer_size) :
         _parent(parent),
         _buffer_size(buffer_size),
@@ -79,19 +110,19 @@ public:
         _dest_node_id(dest_node_id),
         _num_data_bytes_sent(0),
         _packet_seq(0),
-        _rpc_in_flight(false),
-        _is_closed(false),
-        _thrift_serializer(false, 1024) {
-
-        _comm = Comm::instance();
-
-        // Initialize InetAddr
-        struct sockaddr_in sockaddr_in;
-        InetAddr::initialize(&sockaddr_in, destination.hostname.c_str(), destination.port);
-        _addr.set_inet(sockaddr_in);
+        _need_close(false),
+        _thrift_serializer(false, 1024),
+        _dest_addr(destination),
+        _brpc_dest_addr(brpc_dest) {
     }
 
-    virtual ~Channel() { }
+    virtual ~Channel() {
+        if (_closure != nullptr && _closure->unref()) {
+            delete _closure;
+        }
+        // release this before request desctruct
+        _brpc_request.release_finst_id();
+    }
 
     // Initialize channel.
     // Returns OK if successful, error indication otherwise.
@@ -107,14 +138,12 @@ public:
     // rpc (or OK if there wasn't one that hasn't been reported yet).
     // if batch is nullptr, send the eof packet
     Status send_batch(TRowBatch* batch);
+    Status send_batch(PRowBatch* batch, bool eos = false);
 
     // Flush buffered rows and close channel.
     // Returns error status if any of the preceding rpcs failed, OK otherwise.
     void close(RuntimeState* state);
 
-    // Called when event has happened
-    void on_event(EventPtr& event);
-
     int64_t num_data_bytes_sent() const {
         return _num_data_bytes_sent;
     }
@@ -122,20 +151,31 @@ public:
     TRowBatch* thrift_batch() { 
         return &_thrift_batch;
     }
-
-    // DispatchHandler handle, used to handle request event
-    void handle(EventPtr &event_ptr) override;
+    PRowBatch* pb_batch() { 
+        return &_pb_batch;
+    }
+    bool use_brpc() const {
+        return _brpc_stub != nullptr;
+    }
+
+private:
+    inline Status _wait_last_brpc() {
+        auto cntl = &_closure->cntl;
+        brpc::Join(cntl->call_id());
+        if (cntl->Failed()) {
+            LOG(WARNING) << "failed to send brpc batch, error=" << berror(cntl->ErrorCode())
+                << ", error_text=" << cntl->ErrorText();
+            return Status("failed to send batch");
+        }
+        return Status::OK;
+    }
+
 
 private:
-    // finish last send, this function may retry last sent if there is error when wait
-    // for response
-    Status _finish_last_sent();
     // Serialize _batch into _thrift_batch and send via send_batch().
     // Returns send_batch() status.
-    Status send_current_batch();
+    Status send_current_batch(bool eos = false);
     Status close_internal();
-    // send message to remote, this function will reopen connect in ConnectionManager
-    Status _send_message();
 
     DataStreamSender* _parent;
     int _buffer_size;
@@ -152,36 +192,25 @@ private:
     boost::scoped_ptr<RowBatch> _batch;
     TRowBatch _thrift_batch;
 
-    // We want to reuse the rpc thread to prevent creating a thread per rowbatch.
-    // TODO: currently we only have one batch in flight, but we should buffer more
-    // batches. This is a bit tricky since the channels share the outgoing batch
-    // pointer we need some mechanism to coordinate when the batch is all done.
-    // TODO: if the order of row batches does not matter, we can consider increasing
-    // the number of threads.
-    bool _rpc_in_flight;  // true if the rpc in sending.
-
-    Status _rpc_status;  // status of most recently finished transmit_data rpc
-
-    bool _is_closed;
+    bool _need_close;
     int _be_number;
 
-    CommAddress _addr;
-    CommBufPtr _cbp;
-    Comm* _comm;
-    ConnectionManagerPtr _conn_mgr;
-
     ThriftSerializer _thrift_serializer;
 
-    // lock, protect variables
-    std::mutex _lock;
-    std::condition_variable _cond;
-    std::deque<EventPtr> _events;
-
-    uint8_t* _serialized_buf = nullptr;
-    uint32_t _serialized_buf_bytes = 0;
-
+    TNetworkAddress _dest_addr;
+    TNetworkAddress _brpc_dest_addr;
     uint32_t _connect_timeout_ms = 500;
     uint32_t _rpc_timeout_ms = 1000;
+
+    std::shared_ptr _rpc_channel;
+
+    // TODO(zc): init used for brpc
+    PUniqueId _finst_id;
+    PRowBatch _pb_batch;
+    PTransmitDataParams _brpc_request;
+    PInternalService_Stub* _brpc_stub = nullptr;
+    TransmitDataClosure* _closure = nullptr;
+    int32_t _brpc_timeout_ms = 500;
 };
 
 Status DataStreamSender::Channel::init(RuntimeState* state) {
@@ -191,19 +220,34 @@ Status DataStreamSender::Channel::init(RuntimeState* state) {
     int capacity = std::max(1, _buffer_size / std::max(_row_desc.get_row_size(), 1));
     _batch.reset(new RowBatch(_row_desc, capacity, _parent->_mem_tracker.get()));
 
-    _conn_mgr = state->exec_env()->get_conn_manager();
-    _conn_mgr->add(_addr, _connect_timeout_ms, NULL);
-    // One hour is max rpc timeout
-    _rpc_timeout_ms = std::min(3600, std::max(1, state->query_options().query_timeout / 2)) * 1000;
+    {
+        // One hour is max rpc timeout
+        _rpc_timeout_ms = std::min(3600, std::max(1, state->query_options().query_timeout / 2)) * 1000;
+
+        _rpc_channel = std::make_shared(
+            Comm::instance(), state->exec_env()->get_conn_manager(), 0);
+        RETURN_IF_ERROR(_rpc_channel->init(_dest_addr.hostname, _dest_addr.port,
+                                           _connect_timeout_ms, _rpc_timeout_ms));
+    }
+    if (!_brpc_dest_addr.hostname.empty()) {
+        // initialize brpc request
+        _finst_id.set_hi(_fragment_instance_id.hi);
+        _finst_id.set_lo(_fragment_instance_id.lo);
+        _brpc_request.set_allocated_finst_id(&_finst_id);
+        _brpc_request.set_node_id(_dest_node_id);
+        _brpc_request.set_sender_id(_parent->_sender_id);
+        _brpc_request.set_be_number(_be_number);
+
+        _brpc_timeout_ms = std::min(3600, state->query_options().query_timeout) * 1000;
+        _brpc_stub = state->exec_env()->brpc_stub_cache()->get_stub(_brpc_dest_addr);
+    }
+    _need_close = true;
     return Status::OK;
 }
 
 Status DataStreamSender::Channel::send_batch(TRowBatch* batch) {
     VLOG_ROW << "Channel::send_batch() instance_id=" << _fragment_instance_id
              << " dest_node=" << _dest_node_id;
-
-    RETURN_IF_ERROR(_finish_last_sent());
-
     TTransmitDataParams params;
     params.protocol_version = PaloInternalServiceVersion::V1;
     params.__set_dest_fragment_instance_id(_fragment_instance_id);
@@ -222,119 +266,36 @@ Status DataStreamSender::Channel::send_batch(TRowBatch* batch) {
         params.__set_eos(true);
     }
 
-    _thrift_serializer.serialize(&params, &_serialized_buf_bytes, &_serialized_buf);
+    uint8_t* serialized_buf = nullptr;
+    uint32_t serialized_buf_bytes = 0;
+    _thrift_serializer.serialize(&params, &serialized_buf_bytes, &serialized_buf);
 
-    return _send_message();
+    return _rpc_channel->send_message(serialized_buf, serialized_buf_bytes);
 }
 
-void DataStreamSender::Channel::handle(EventPtr& event) {
-    {
-        std::lock_guard<std::mutex> l(_lock);
-        _events.push_back(event);
+Status DataStreamSender::Channel::send_batch(PRowBatch* batch, bool eos) {
+    if (_closure == nullptr) {
+        _closure = new TransmitDataClosure();
+        _closure->ref();
+    } else {
+        RETURN_IF_ERROR(_wait_last_brpc());
+        _closure->cntl.Reset();
     }
-    _cond.notify_one();
-}
+    VLOG_ROW << "Channel::send_batch() instance_id=" << _fragment_instance_id
+             << " dest_node=" << _dest_node_id;
 
-Status DataStreamSender::Channel::_finish_last_sent() {
-    if (!_rpc_in_flight) {
-        return _rpc_status;
+    _brpc_request.set_eos(eos);
+    if (batch != nullptr) {
+        _brpc_request.set_allocated_row_batch(batch);
     }
-    int retry_times = 1;
-    while (true) {
-        EventPtr event;
-        {
-            std::unique_lock<std::mutex> l(_lock);
-            auto duration = std::chrono::milliseconds(2 * _rpc_timeout_ms);
-            if (_cond.wait_for(l, duration, [this]() { return !this->_events.empty(); })) {
-                event = _events.front();
-                _events.pop_front();
-            }
-        }
-        if (event == nullptr) {
-            LOG(WARNING) << "it's so weird, wait reponse event timeout, request="
-                << _cbp->header.id << ", addr=" << _addr.to_str();
-            _rpc_in_flight = false;
-            if (retry_times-- > 0) {
-                // timeout to receive response
-                RETURN_IF_ERROR(_send_message());
-            } else {
-                LOG(WARNING) << "fail to send batch, _add=" << _addr.to_str()
-                    << ", request_id="<< _cbp->header.id;
-                _rpc_status = Status(TStatusCode::THRIFT_RPC_ERROR, "fail to send batch");
-                break;
-            }
-            continue;
-        }
-        if (event->type == Event::MESSAGE) {
-            if (event->header.id != _cbp->header.id) {
-                LOG(WARNING) << "receive event id not equal with in-flight request, request_id="
-                    << _cbp->header.id << ", event=" << event->to_str();
-                continue;
-            }
-            // response recept
-            _rpc_in_flight = false;
-            return Status::OK;
-        } else if (event->type == Event::DISCONNECT || event->type == Event::ERROR) {
-            if (event->header.id != 0 && event->header.id != _cbp->header.id) {
-                LOG(WARNING) << "receive event id not equal with in-flight request, request_id="
-                    << _cbp->header.id << ", event=" << event->to_str();
-                continue;
-            }
-            LOG(WARNING) << "receive response failed, request_id=" << _cbp->header.id
-                << ", event=" << event->to_str();
-            _rpc_in_flight = false;
-            // error happend when receving response, we need to retry last request
-            if (retry_times-- > 0) {
-                // timeout to receive response
-                RETURN_IF_ERROR(_send_message());
-            } else {
-                LOG(WARNING) << "fail to send batch, request_id="<< _cbp->header.id
-                    << ", event=" << event->to_str();
-                _rpc_status = Status(TStatusCode::THRIFT_RPC_ERROR, "fail to send batch");
-                break;
-            }
-        } else {
-            _rpc_in_flight = false;
-            LOG(ERROR) << "recevie unexpect event, event=" << event->to_str();
-            _rpc_status = Status(TStatusCode::THRIFT_RPC_ERROR, "fail to send batch");
-            break;
-        }
+    _brpc_request.set_packet_seq(_packet_seq++);
+
+    _closure->ref();
+    _closure->cntl.set_timeout_ms(_brpc_timeout_ms);
+    _brpc_stub->transmit_data(&_closure->cntl, &_brpc_request, &_closure->result, _closure);
+    if (batch != nullptr) {
+        _brpc_request.release_row_batch();
     }
-
-    return _rpc_status;
-}
-
-Status DataStreamSender::Channel::_send_message() {
-    DCHECK(!_rpc_in_flight);
-
-    CommHeader header;
-    CommBufPtr new_comm_buf = std::make_shared<CommBuf>(header, _serialized_buf_bytes);
-    new_comm_buf->append_bytes(_serialized_buf, _serialized_buf_bytes);
-
-    auto res = _comm->send_request(_addr, _rpc_timeout_ms, new_comm_buf, this);
-    if (res != error::OK) {
-        LOG(WARNING) << "fail to send_request, addr=" << _addr.to_str()
-            << ", res=" << res << ", message=" << error::get_text(res);
-        // sleep 10ms to wait ConnectionManager to be notify
-        usleep(10 * 1000);
-        _conn_mgr->add(_addr, _connect_timeout_ms, "PaloBeDataStreamMgr");
-        bool is_connected = _conn_mgr->wait_for_connection(_addr, _connect_timeout_ms);
-        if (!is_connected) {
-            LOG(WARNING) << "fail to wait_for_connection, addr=" << _addr.to_str();
-            _conn_mgr->remove(_addr);
-            _rpc_status = Status(TStatusCode::THRIFT_RPC_ERROR, "connection to remote PaloBe failed");
-            return _rpc_status;
-        }
-        res = _comm->send_request(_addr, _rpc_timeout_ms, new_comm_buf, this);
-        if (res != error::OK) {
-            LOG(WARNING) << "fail to send_request, addr=" << _addr.to_str()
-                << ", res=" << res << ", message=" << error::get_text(res);
-            _rpc_status = Status(TStatusCode::THRIFT_RPC_ERROR, "fail to send_request");
-            return _rpc_status;
-        }
-    }
-    _cbp = new_comm_buf;
-    _rpc_in_flight = true;
     return Status::OK;
 }
 
@@ -366,32 +327,51 @@ Status DataStreamSender::Channel::add_row(TupleRow* row) {
     return Status::OK;
 }
 
-Status DataStreamSender::Channel::send_current_batch() {
-    {
-        SCOPED_TIMER(_parent->_serialize_batch_timer);
-        int uncompressed_bytes = _batch->serialize(&_thrift_batch);
-        COUNTER_UPDATE(_parent->_bytes_sent_counter, RowBatch::get_batch_size(_thrift_batch));
-        COUNTER_UPDATE(_parent->_uncompressed_bytes_counter, uncompressed_bytes);
+Status DataStreamSender::Channel::send_current_batch(bool eos) {
+    if (use_brpc()) {
+        {
+            SCOPED_TIMER(_parent->_serialize_batch_timer);
+            int uncompressed_bytes = _batch->serialize(&_pb_batch);
+            COUNTER_UPDATE(_parent->_bytes_sent_counter, RowBatch::get_batch_size(_pb_batch));
+            COUNTER_UPDATE(_parent->_uncompressed_bytes_counter, uncompressed_bytes);
+        }
+        _batch->reset();
+        RETURN_IF_ERROR(send_batch(&_pb_batch, eos));
+    } else {
+        {
+            SCOPED_TIMER(_parent->_serialize_batch_timer);
+            int uncompressed_bytes = _batch->serialize(&_thrift_batch);
+            COUNTER_UPDATE(_parent->_bytes_sent_counter, RowBatch::get_batch_size(_thrift_batch));
+            COUNTER_UPDATE(_parent->_uncompressed_bytes_counter, uncompressed_bytes);
+        }
+        _batch->reset();
+        RETURN_IF_ERROR(send_batch(&_thrift_batch));
     }
-    _batch->reset();
-    RETURN_IF_ERROR(send_batch(&_thrift_batch));
     return Status::OK;
 }
 
 Status DataStreamSender::Channel::close_internal() {
-    if (_is_closed) {
+    if (!_need_close) {
         return Status::OK;
     }
     VLOG_RPC << "Channel::close() instance_id=" << _fragment_instance_id
              << " dest_node=" << _dest_node_id
              << " #rows= " << _batch->num_rows();
-    if (_batch != NULL && _batch->num_rows() > 0) {
-        RETURN_IF_ERROR(send_current_batch());
+    if (use_brpc()) {
+        if (_batch != NULL && _batch->num_rows() > 0) {
+            RETURN_IF_ERROR(send_current_batch(true));
+        } else {
+            RETURN_IF_ERROR(send_batch(nullptr, true));
+        }
+        RETURN_IF_ERROR(_wait_last_brpc());
+    } else {
+        if (_batch != NULL && _batch->num_rows() > 0) {
+            RETURN_IF_ERROR(send_current_batch());
+        }
+        RETURN_IF_ERROR(send_batch((TRowBatch*)nullptr));
+        RETURN_IF_ERROR(_rpc_channel->wait_last_sent());
     }
-
-    RETURN_IF_ERROR(send_batch(nullptr));
-    RETURN_IF_ERROR(_finish_last_sent());
-    _is_closed = true;
+    _need_close = false;
     return Status::OK;
 }
 
@@ -412,6 +392,7 @@ DataStreamSender::DataStreamSender(
         _part_type(sink.output_partition.type),
         _ignore_not_found(sink.__isset.ignore_not_found ? sink.ignore_not_found : true),
         _current_thrift_batch(&_thrift_batch1),
+        _current_pb_batch(&_pb_batch1),
         _profile(NULL),
         _serialize_batch_timer(NULL),
         _thrift_transmit_timer(NULL),
@@ -426,6 +407,7 @@ DataStreamSender::DataStreamSender(
     for (int i = 0; i < destinations.size(); ++i) {
         _channel_shared_ptrs.emplace_back(
             new Channel(this, row_desc, destinations[i].server,
+                        destinations[i].brpc_server,
                         destinations[i].fragment_instance_id,
                         sink.dest_node_id, per_channel_buffer_size));
         _channels.push_back(_channel_shared_ptrs[i].get());
@@ -512,8 +494,12 @@ Status DataStreamSender::prepare(RuntimeState* state) {
         boost::bind(&RuntimeProfile::units_per_second, _bytes_sent_counter,
                                              profile()->total_time_counter()), "");
 
+    _use_brpc = true;
     for (int i = 0; i < _channels.size(); ++i) {
         RETURN_IF_ERROR(_channels[i]->init(state));
+        if (_use_brpc && !_channels[i]->use_brpc()) {
+            _use_brpc = false;
+        }
     }
 
     return Status::OK;
@@ -539,23 +525,40 @@ Status DataStreamSender::send(RuntimeState* state, RowBatch* batch) {
 
     // Unpartition or _channel size
     if (_part_type == TPartitionType::UNPARTITIONED || _channels.size() == 1) {
-        // _current_thrift_batch is *not* the one that was written by the last call
-        // to Serialize()
-        RETURN_IF_ERROR(serialize_batch(batch, _current_thrift_batch, _channels.size()));
-        // SendBatch() will block if there are still in-flight rpcs (and those will
-        // reference the previously written thrift batch)
-        for (int i = 0; i < _channels.size(); ++i) {
-            RETURN_IF_ERROR(_channels[i]->send_batch(_current_thrift_batch));
+        if (_use_brpc) {
+            RETURN_IF_ERROR(serialize_batch(batch, _current_pb_batch, _channels.size()));
+            for (auto channel : _channels) {
+                RETURN_IF_ERROR(channel->send_batch(_current_pb_batch));
+            }
+            _current_pb_batch = (_current_pb_batch == &_pb_batch1 ? &_pb_batch2 : &_pb_batch1);
+        } else {
+            // _current_thrift_batch is *not* the one that was written by the last call
+            // to Serialize()
+            RETURN_IF_ERROR(serialize_batch(batch, _current_thrift_batch, _channels.size()));
+            // SendBatch() will block if there are still in-flight rpcs (and those will
+            // reference the previously written thrift batch)
+            for (int i = 0; i < _channels.size(); ++i) {
+                RETURN_IF_ERROR(_channels[i]->send_batch(_current_thrift_batch));
+            }
+            _current_thrift_batch =
+                (_current_thrift_batch == &_thrift_batch1 ? &_thrift_batch2 : &_thrift_batch1);
         }
-        _current_thrift_batch =
-            (_current_thrift_batch == &_thrift_batch1 ? &_thrift_batch2 : &_thrift_batch1);
     } else if (_part_type == TPartitionType::RANDOM) {
-        // Round-robin batches among channels. Wait for the current channel to finish its
-        // rpc before overwriting its batch.
-        Channel* current_channel = _channels[_current_channel_idx];
-        RETURN_IF_ERROR(serialize_batch(batch, current_channel->thrift_batch()));
-        RETURN_IF_ERROR(current_channel->send_batch(current_channel->thrift_batch()));
-        _current_channel_idx = (_current_channel_idx + 1) % _channels.size();
+        if (_use_brpc) {
+            // Round-robin batches among channels. Wait for the current channel to finish its
+            // rpc before overwriting its batch.
+            Channel* current_channel = _channels[_current_channel_idx];
+            RETURN_IF_ERROR(serialize_batch(batch, current_channel->pb_batch()));
+            RETURN_IF_ERROR(current_channel->send_batch(current_channel->pb_batch()));
+            _current_channel_idx = (_current_channel_idx + 1) % _channels.size();
+        } else {
+            // Round-robin batches among channels. Wait for the current channel to finish its
+            // rpc before overwriting its batch.
+            Channel* current_channel = _channels[_current_channel_idx];
+            RETURN_IF_ERROR(serialize_batch(batch, current_channel->thrift_batch()));
+            RETURN_IF_ERROR(current_channel->send_batch(current_channel->thrift_batch()));
+            _current_channel_idx = (_current_channel_idx + 1) % _channels.size();
+        }
     } else if (_part_type == TPartitionType::HASH_PARTITIONED) {
         // hash-partition batch's rows across channels
         int num_channels = _channels.size();
@@ -710,7 +713,8 @@ Status DataStreamSender::close(RuntimeState* state, Status exec_status) {
     return Status::OK;
 }
 
-Status DataStreamSender::serialize_batch(RowBatch* src, TRowBatch* dest, int num_receivers) {
+template <typename T>
+Status DataStreamSender::serialize_batch(RowBatch* src, T* dest, int num_receivers) {
     VLOG_ROW << "serializing " << src->num_rows() << " rows";
     {
         // TODO(zc)
@@ -724,7 +728,6 @@ Status DataStreamSender::serialize_batch(RowBatch* src, TRowBatch* dest, int num
         // int uncompressed_bytes = bytes - dest->tuple_data.size() + dest->uncompressed_size;
         // The size output_batch would be if we didn't compress tuple_data (will be equal to
         // actual batch size if tuple_data isn't compressed)
-
         COUNTER_UPDATE(_bytes_sent_counter, bytes * num_receivers);
         COUNTER_UPDATE(_uncompressed_bytes_counter, uncompressed_bytes * num_receivers);
     }
@@ -732,7 +735,6 @@ Status DataStreamSender::serialize_batch(RowBatch* src, TRowBatch* dest, int num
     return Status::OK;
 }
 
-
 int64_t DataStreamSender::get_num_data_bytes_sent() const {
     // TODO: do we need synchronization here or are reads & writes to 8-byte ints
     // atomic?
diff --git a/be/src/runtime/data_stream_sender.h b/be/src/runtime/data_stream_sender.h
index 9ebd637389..ba09e70dc5 100644
--- a/be/src/runtime/data_stream_sender.h
+++ b/be/src/runtime/data_stream_sender.h
@@ -30,6 +30,7 @@
 #include "common/status.h"
 #include "util/runtime_profile.h"
 #include "gen_cpp/Data_types.h"  // for TRowBatch
+#include "gen_cpp/data.pb.h"  // for PRowBatch
 
 #include "rpc/dispatch_handler.h"
 #include "rpc/io_handler.h"
@@ -93,7 +94,8 @@ public:
     /// Serializes the src batch into the dest thrift batch. Maintains metrics.
     /// num_receivers is the number of receivers this batch will be sent to. Only
     /// used to maintain metrics.
-    Status serialize_batch(RowBatch* src, TRowBatch* dest, int num_receivers = 1);
+    template <typename T>
+    Status serialize_batch(RowBatch* src, T* dest, int num_receivers = 1);
 
     // Return total number of bytes sent in TRowBatch.data. If batches are
     // broadcast to multiple receivers, they are counted once per receiver.
@@ -136,12 +138,19 @@ private:
     TPartitionType::type _part_type;
     bool _ignore_not_found;
 
+    // use this flag to back-compatible for old data transmit
+    bool _use_brpc = false;
+
     // serialized batches for broadcasting; we need two so we can write
     // one while the other one is still being sent
     TRowBatch _thrift_batch1;
     TRowBatch _thrift_batch2;
     TRowBatch* _current_thrift_batch;  // the next one to fill in send()
 
+    PRowBatch _pb_batch1;
+    PRowBatch _pb_batch2;
+    PRowBatch* _current_pb_batch = nullptr;
+
     std::vector _partition_expr_ctxs;  // compute per-row partition values
 
     std::vector _channels;
diff --git a/be/src/runtime/decimal_value.cpp b/be/src/runtime/decimal_value.cpp
index c724497263..8e490713f5 100755
--- a/be/src/runtime/decimal_value.cpp
+++ b/be/src/runtime/decimal_value.cpp
@@ -498,6 +498,8 @@ int do_div_mod(
     const int32_t* buff1 = value1._buffer;
     const int32_t* buff2 = value2._buffer;
 
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
     // removing all the leading zeros
     // process value2
     int32_t first_big_digit_length = (prec2 - 1) % DIG_PER_DEC1 + 1;
@@ -529,6 +531,7 @@ int do_div_mod(
     for (; *buff1 < powers10[--first_big_digit_length];) {
         --prec1;
     }
+#pragma GCC diagnostic pop
 
     // 比较两个数的整形部分,得到结果的intg。 如果被除数较小,intg=0.
     int32_t dintg = (prec1 - frac1) - (prec2 - frac2) + (*buff1 >= *buff2);
diff --git a/be/src/runtime/decimal_value.h b/be/src/runtime/decimal_value.h
index 1850362201..5383a7a2ab 100755
--- a/be/src/runtime/decimal_value.h
+++ b/be/src/runtime/decimal_value.h
@@ -443,6 +443,9 @@ public:
     static const char* _s_llvm_class_name;
 
 private:
+
+    friend class MultiDistinctDecimalState;
+
     bool is_zero() const {
         const int32_t* buff = _buffer;
         const int32_t* end = buff + round_up(_int_length)
@@ -568,6 +571,8 @@ inline const int32_t* DecimalValue::get_first_no_zero_index(
         ++buff;
     }
 
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
     // When the value of a "big digit" is "000099999", its 'intg' may be 5/6/7/8/9,
     // we get accurate 'intg' here and the first no zero index of buff
     if (temp_intg > 0) {
@@ -578,6 +583,7 @@ inline const int32_t* DecimalValue::get_first_no_zero_index(
     } else {
         temp_intg = 0;
     }
+#pragma GCC diagnostic pop
     *int_digit_num = temp_intg;
     return buff;
 }
diff --git a/be/src/runtime/descriptors.cpp b/be/src/runtime/descriptors.cpp
index 84dad3e381..5caaa3127a 100644
--- a/be/src/runtime/descriptors.cpp
+++ b/be/src/runtime/descriptors.cpp
@@ -321,6 +321,14 @@ void RowDescriptor::to_thrift(std::vector* row_tuple_ids) {
     }
 }
 
+void RowDescriptor::to_protobuf(
+        google::protobuf::RepeatedField<google::protobuf::int32>* row_tuple_ids) {
+    row_tuple_ids->Clear();
+    for (auto desc : _tuple_desc_map) {
+        row_tuple_ids->Add(desc->id());
+    }
+}
+
 bool RowDescriptor::is_prefix_of(const RowDescriptor& other_desc) const {
     if (_tuple_desc_map.size() > other_desc._tuple_desc_map.size()) {
         return false;
diff --git a/be/src/runtime/descriptors.h b/be/src/runtime/descriptors.h
index 262e960175..f2f778da3f 100644
--- a/be/src/runtime/descriptors.h
+++ b/be/src/runtime/descriptors.h
@@ -27,6 +27,9 @@
 #include 
 #include 
 
+#include 
+#include 
+
 #include "common/status.h"
 #include "common/global_types.h"
 #include "gen_cpp/Descriptors_types.h"  // for TTupleId
@@ -463,6 +466,8 @@ public:
 
     // Populate row_tuple_ids with our ids.
     void to_thrift(std::vector<TTupleId>* row_tuple_ids);
+    void to_protobuf(
+        google::protobuf::RepeatedField<google::protobuf::int32>* row_tuple_ids);
 
     // Return true if the tuple ids of this descriptor are a prefix
     // of the tuple ids of other_desc.
diff --git a/be/src/runtime/disk_io_mgr.cc b/be/src/runtime/disk_io_mgr.cc
index ab2f41e6fc..9727e12a62 100644
--- a/be/src/runtime/disk_io_mgr.cc
+++ b/be/src/runtime/disk_io_mgr.cc
@@ -722,20 +722,24 @@ char* DiskIoMgr::get_free_buffer(int64_t* buffer_size) {
     char* buffer = NULL;
     if (_free_buffers[idx].empty()) {
         ++_num_allocated_buffers;
+#if 0
         if (PaloMetrics::io_mgr_num_buffers() != NULL) {
             PaloMetrics::io_mgr_num_buffers()->increment(1L);
         }
         if (PaloMetrics::io_mgr_total_bytes() != NULL) {
             PaloMetrics::io_mgr_total_bytes()->increment(*buffer_size);
         }
+#endif
         // Update the process mem usage.  This is checked the next time we start
         // a read for the next reader (DiskIoMgr::GetNextScanRange)
         _process_mem_tracker->consume(*buffer_size);
         buffer = new char[*buffer_size];
     } else {
+#if 0
         if (PaloMetrics::io_mgr_num_unused_buffers() != NULL) {
             PaloMetrics::io_mgr_num_unused_buffers()->increment(-1L);
         }
+#endif
         buffer = _free_buffers[idx].front();
         _free_buffers[idx].pop_front();
     }
@@ -760,7 +764,7 @@ void DiskIoMgr::gc_io_buffers() {
         }
         _free_buffers[idx].clear();
     }
-
+#if 0
     if (PaloMetrics::io_mgr_num_buffers() != NULL) {
         PaloMetrics::io_mgr_num_buffers()->increment(-buffers_freed);
     }
@@ -770,6 +774,7 @@ void DiskIoMgr::gc_io_buffers() {
     if (PaloMetrics::io_mgr_num_unused_buffers() != NULL) {
         PaloMetrics::io_mgr_num_unused_buffers()->update(0);
     }
+#endif
 }
 
 void DiskIoMgr::return_free_buffer(BufferDescriptor* desc) {
@@ -787,19 +792,23 @@ void DiskIoMgr::return_free_buffer(char* buffer, int64_t buffer_size) {
     unique_lock lock(_free_buffers_lock);
     if (!config::disable_mem_pools && _free_buffers[idx].size() < config::max_free_io_buffers) {
         _free_buffers[idx].push_back(buffer);
+#if 0
         if (PaloMetrics::io_mgr_num_unused_buffers() != NULL) {
             PaloMetrics::io_mgr_num_unused_buffers()->increment(1L);
         }
+#endif
     } else {
         _process_mem_tracker->release(buffer_size);
         --_num_allocated_buffers;
         delete[] buffer;
+#if 0
         if (PaloMetrics::io_mgr_num_buffers() != NULL) {
             PaloMetrics::io_mgr_num_buffers()->increment(-1L);
         }
         if (PaloMetrics::io_mgr_total_bytes() != NULL) {
             PaloMetrics::io_mgr_total_bytes()->increment(-buffer_size);
         }
+#endif
     }
 }
 
@@ -1173,9 +1182,11 @@ Status DiskIoMgr::write_range_helper(FILE* file_handle, WriteRange* write_range)
                 << errno << " description=" << get_str_err_msg();
         return Status(error_msg.str());
     }
+#if 0
     if (PaloMetrics::io_mgr_bytes_written() != NULL) {
         PaloMetrics::io_mgr_bytes_written()->increment(write_range->_len);
     }
+#endif
 
     return Status::OK;
 }
diff --git a/be/src/runtime/disk_io_mgr.h b/be/src/runtime/disk_io_mgr.h
index dec3633355..e487ec762e 100644
--- a/be/src/runtime/disk_io_mgr.h
+++ b/be/src/runtime/disk_io_mgr.h
@@ -38,7 +38,7 @@
 #include "common/status.h"
 #include "util/error_util.h"
 #include "util/internal_queue.h"
-#include "util/palo_metrics.h"
+#include "util/metrics.h"
 #include "util/runtime_profile.h"
 #include "runtime/mem_tracker.h"
 
diff --git a/be/src/runtime/disk_io_mgr_scan_range.cc b/be/src/runtime/disk_io_mgr_scan_range.cc
index 7148bc1719..e15af98144 100644
--- a/be/src/runtime/disk_io_mgr_scan_range.cc
+++ b/be/src/runtime/disk_io_mgr_scan_range.cc
@@ -316,9 +316,11 @@ Status DiskIoMgr::ScanRange::open() {
         return Status(ss.str());
     }
     // }
+#if 0
     if (PaloMetrics::io_mgr_num_open_files() != NULL) {
         PaloMetrics::io_mgr_num_open_files()->increment(1L);
     }
+#endif
     return Status::OK;
 }
 
@@ -364,9 +366,11 @@ void DiskIoMgr::ScanRange::close() {
         fclose(_local_file);
         _local_file = NULL;
     }
+#if 0
     if (PaloMetrics::io_mgr_num_open_files() != NULL) {
         PaloMetrics::io_mgr_num_open_files()->increment(-1L);
     }
+#endif
 }
 
 /*
diff --git a/be/src/runtime/dpp_sink.cpp b/be/src/runtime/dpp_sink.cpp
index 55c5f88234..6a2461b3ba 100644
--- a/be/src/runtime/dpp_sink.cpp
+++ b/be/src/runtime/dpp_sink.cpp
@@ -24,6 +24,7 @@
 #include "exprs/slot_ref.h"
 #include "common/object_pool.h"
 #include "runtime/dpp_writer.h"
+#include "runtime/exec_env.h"
 #include "runtime/tuple_row.h"
 #include "runtime/runtime_state.h"
 #include "runtime/row_batch.h"
@@ -898,7 +899,7 @@ Status DppSink::finish(RuntimeState* state) {
     CountDownLatch latch(_translator_count);
     for (auto& iter : _translator_map) {
         for (auto& trans : iter.second) {
-            state->etl_thread_pool()->offer(
+            state->exec_env()->etl_thread_pool()->offer(
                 boost::bind(&DppSink::process, this, state, trans, &latch));
         }
     }
diff --git a/be/src/runtime/exec_env.cpp b/be/src/runtime/exec_env.cpp
index a65ce5240b..a9038c8678 100644
--- a/be/src/runtime/exec_env.cpp
+++ b/be/src/runtime/exec_env.cpp
@@ -25,7 +25,9 @@
 #include 
 
 #include "common/logging.h"
+#include "rpc/connection_manager.h"
 #include "runtime/broker_mgr.h"
+#include "runtime/bufferpool/buffer_pool.h"
 #include "runtime/client_cache.h"
 #include "runtime/data_stream_mgr.h"
 #include "runtime/disk_io_mgr.h"
@@ -34,6 +36,7 @@
 #include "runtime/thread_resource_mgr.h"
 #include "runtime/fragment_mgr.h"
 #include "runtime/tmp_file_mgr.h"
+#include "runtime/bufferpool/reservation_tracker.h"
 #include "util/metrics.h"
 #include "util/network_util.h"
 #include "http/webserver.h"
@@ -48,6 +51,7 @@
 #include "http/action/reload_tablet_action.h"
 #include "http/action/snapshot_action.h"
 #include "http/action/pprof_actions.h"
+#include "http/action/metrics_action.h"
 #include "http/download_action.h"
 #include "http/monitor_action.h"
 #include "http/http_method.h"
@@ -57,6 +61,9 @@
 #include "runtime/etl_job_mgr.h"
 #include "runtime/load_path_mgr.h"
 #include "runtime/pull_load_task_mgr.h"
+#include "util/pretty_printer.h"
+#include "util/palo_metrics.h"
+#include "util/brpc_stub_cache.h"
 #include "gen_cpp/BackendService.h"
 #include "gen_cpp/FrontendService.h"
 #include "gen_cpp/TPaloBrokerService.h"
@@ -74,7 +81,6 @@ ExecEnv::ExecEnv() :
         _broker_client_cache(new BrokerServiceClientCache()),
         _webserver(new Webserver()),
         _web_page_handler(new WebPageHandler(_webserver.get())),
-        _metrics(new MetricGroup("exec_env")),
         _mem_tracker(NULL),
         _pool_mem_trackers(new PoolMemTrackerRegistry),
         _thread_mgr(new ThreadResourceMgr),
@@ -94,10 +100,12 @@ ExecEnv::ExecEnv() :
         _bfd_parser(BfdParser::create()),
         _pull_load_task_mgr(new PullLoadTaskMgr(config::pull_load_task_dir)),
         _broker_mgr(new BrokerMgr(this)),
+        _brpc_stub_cache(new BrpcStubCache()),
         _enable_webserver(true),
         _tz_database(TimezoneDatabase()) {
-    _client_cache->init_metrics(_metrics.get(), "palo.backends");
-    //_frontend_client_cache->init_metrics(_metrics.get(), "frontend-server.backends");
+    _client_cache->init_metrics(PaloMetrics::metrics(), "backend");
+    _frontend_client_cache->init_metrics(PaloMetrics::metrics(), "frontend");
+    _broker_client_cache->init_metrics(PaloMetrics::metrics(), "broker");
     _result_mgr->init();
     _cgroups_mgr->init_cgroups();
     _etl_job_mgr->init();
@@ -138,6 +146,30 @@ Status ExecEnv::start_services() {
         return Status("Failed to parse mem limit from '" + config::mem_limit + "'.");
     }
 
+    std::stringstream ss;
+    if (!BitUtil::IsPowerOf2(config::FLAGS_min_buffer_size)) {
+        ss << "--min_buffer_size must be a power-of-two: " << config::FLAGS_min_buffer_size;
+        return Status(ss.str());
+    }
+
+    int64_t buffer_pool_limit = ParseUtil::parse_mem_spec(config::FLAGS_buffer_pool_limit,
+        &is_percent);
+    if (buffer_pool_limit <= 0) {
+        ss << "Invalid --buffer_pool_limit value, must be a percentage or "
+           "positive bytes value or percentage: " << config::FLAGS_buffer_pool_limit;
+        return Status(ss.str());
+    }
+    buffer_pool_limit = BitUtil::RoundDown(buffer_pool_limit, config::FLAGS_min_buffer_size);
+
+    int64_t clean_pages_limit = ParseUtil::parse_mem_spec(config::FLAGS_buffer_pool_clean_pages_limit,
+        &is_percent);
+    if (clean_pages_limit <= 0) {
+        ss << "Invalid --buffer_pool_clean_pages_limit value, must be a percentage or "
+              "positive bytes value or percentage: " << config::FLAGS_buffer_pool_clean_pages_limit;
+        return Status(ss.str());
+    }
+
+    init_buffer_pool(config::FLAGS_min_buffer_size, buffer_pool_limit, clean_pages_limit);
     // Limit of 0 means no memory limit.
     if (bytes_limit > 0) {
         _mem_tracker.reset(new MemTracker(bytes_limit));
@@ -163,8 +195,7 @@ Status ExecEnv::start_services() {
         LOG(INFO) << "Webserver is disabled";
     }
 
-    _metrics->init(_enable_webserver ? _web_page_handler.get() : NULL);
-    RETURN_IF_ERROR(_tmp_file_mgr->init(_metrics.get()));
+    RETURN_IF_ERROR(_tmp_file_mgr->init(PaloMetrics::metrics()));
 
     return Status::OK;
 }
@@ -210,6 +241,11 @@ Status ExecEnv::start_webserver() {
     // register pprof actions
     PprofActions::setup(this, _webserver.get());
 
+    {
+        auto action = _object_pool.add(new MetricsAction(PaloMetrics::metrics()));
+        _webserver->register_handler(HttpMethod::GET, "/metrics", action);
+    }
+
 #ifndef BE_TEST
     // Register BE checksum action
     ChecksumAction* checksum_action = new ChecksumAction(this);
@@ -232,8 +268,19 @@ uint32_t ExecEnv::cluster_id() {
     return OLAPRootPath::get_instance()->effective_cluster_id();
 }
 
+void ExecEnv::init_buffer_pool(int64_t min_page_size, int64_t capacity, int64_t clean_pages_limit) {
+  DCHECK(_buffer_pool == nullptr);
+  _buffer_pool.reset(new BufferPool(min_page_size, capacity, clean_pages_limit));
+  _buffer_reservation.reset(new ReservationTracker);
+  _buffer_reservation->InitRootTracker(nullptr, capacity);
+}
+
 const std::string& ExecEnv::token() const {
     return _master_info->token;
 }
 
+MetricRegistry* ExecEnv::metrics() const {
+    return PaloMetrics::metrics();
+}
+
 }
diff --git a/be/src/runtime/exec_env.h b/be/src/runtime/exec_env.h
index 7f4539e5b9..bde15b7b2c 100644
--- a/be/src/runtime/exec_env.h
+++ b/be/src/runtime/exec_env.h
@@ -27,6 +27,7 @@
 
 #include "agent/cgroups_mgr.h"
 #include "common/status.h"
+#include "common/object_pool.h"
 #include "exprs/timestamp_functions.h"
 #include "runtime/client_cache.h"
 #include "runtime/lib_cache.h"
@@ -34,8 +35,6 @@
 #include "util/priority_thread_pool.hpp"
 #include "util/thread_pool.hpp"
 
-#include "rpc/connection_manager.h"
-
 namespace palo {
 
 class DataStreamMgr;
@@ -43,7 +42,6 @@ class ResultBufferMgr;
 class TestExecEnv;
 class Webserver;
 class WebPageHandler;
-class MetricGroup;
 class MemTracker;
 class PoolMemTrackerRegistry;
 class ThreadResourceMgr;
@@ -56,6 +54,11 @@ class TmpFileMgr;
 class BfdParser;
 class PullLoadTaskMgr;
 class BrokerMgr;
+class MetricRegistry;
+class BufferPool;
+class ReservationTracker;
+class ConnectionManager;
+class BrpcStubCache;
 
 // Execution environment for queries/plan fragments.
 // Contains all required global structures, and handles to
@@ -78,6 +81,8 @@ public:
 
     const std::string& token() const;
 
+    MetricRegistry* metrics() const;
+
     DataStreamMgr* stream_mgr() {
         return _stream_mgr.get();
     }
@@ -99,9 +104,6 @@ public:
     WebPageHandler* web_page_handler() {
         return _web_page_handler.get();
     }
-    MetricGroup* metrics() {
-        return _metrics.get();
-    }
     MemTracker* process_mem_tracker() {
         return _mem_tracker.get();
     }
@@ -151,7 +153,11 @@ public:
         return _broker_mgr.get();
     }
 
-    ConnectionManagerPtr get_conn_manager() {
+    BrpcStubCache* brpc_stub_cache() const {
+        return _brpc_stub_cache.get();
+    }
+
+    std::shared_ptr get_conn_manager() {
         return _conn_mgr;
     }
 
@@ -165,6 +171,14 @@ public:
     // Initializes the exec env for running FE tests.
     Status init_for_tests();
 
+    ReservationTracker* buffer_reservation() { 
+        return _buffer_reservation.get(); 
+    }
+ 
+    BufferPool* buffer_pool() { 
+        return _buffer_pool.get(); 
+    }
+
 private:
     Status start_webserver();
     // Leave protected so that subclasses can override
@@ -175,7 +189,6 @@ private:
     std::unique_ptr_broker_client_cache;
     boost::scoped_ptr _webserver;
     boost::scoped_ptr _web_page_handler;
-    boost::scoped_ptr _metrics;
     boost::scoped_ptr _mem_tracker;
     boost::scoped_ptr _pool_mem_trackers;
     boost::scoped_ptr _thread_mgr;
@@ -192,18 +205,26 @@ private:
     std::unique_ptr _bfd_parser;
     std::unique_ptr _pull_load_task_mgr;
     std::unique_ptr _broker_mgr;
+    std::unique_ptr _brpc_stub_cache;
     bool _enable_webserver;
 
+    boost::scoped_ptr _buffer_reservation;
+    boost::scoped_ptr _buffer_pool;
+
     /*
     Comm* comm;
     DispatchHandlerPtr dhp;
     ApplicationQueue *app_queue;
     */
-    ConnectionManagerPtr _conn_mgr;
+    std::shared_ptr _conn_mgr;
 
+    ObjectPool _object_pool;
 private:
     static ExecEnv* _exec_env;
     TimezoneDatabase _tz_database;
+
+    /// Initialise 'buffer_pool_' and 'buffer_reservation_' with given capacity.
+    void init_buffer_pool(int64_t min_page_len, int64_t capacity, int64_t clean_pages_limit);
 };
 
 }
diff --git a/be/src/runtime/export_sink.cpp b/be/src/runtime/export_sink.cpp
index 0c9d3cc3d0..585b23145d 100644
--- a/be/src/runtime/export_sink.cpp
+++ b/be/src/runtime/export_sink.cpp
@@ -21,8 +21,10 @@
 #include "runtime/mysql_table_sink.h"
 #include "runtime/mem_tracker.h"
 #include "runtime/tuple_row.h"
+#include "runtime/row_batch.h"
 #include "util/runtime_profile.h"
 #include "util/debug_util.h"
+#include "util/types.h"
 #include "exec/local_file_writer.h"
 #include "exec/broker_writer.h"
 #include 
@@ -136,7 +138,7 @@ Status ExportSink::gen_row_buffer(TupleRow* row, std::stringstream* ss) {
             (*ss) << *static_cast(item);
             break;
         case TYPE_LARGEINT:
-            (*ss) << *static_cast<__int128*>(item);
+            (*ss) << reinterpret_cast(item)->value;
             break;
         case TYPE_FLOAT:
             (*ss) << *static_cast(item);
diff --git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp
index 0166d2d3c5..9927ace514 100644
--- a/be/src/runtime/fragment_mgr.cpp
+++ b/be/src/runtime/fragment_mgr.cpp
@@ -34,6 +34,7 @@
 #include "runtime/datetime_value.h"
 #include "util/stopwatch.hpp"
 #include "util/debug_util.h"
+#include "util/palo_metrics.h"
 #include "util/thrift_util.h"
 #include "gen_cpp/PaloInternalService_types.h"
 #include "gen_cpp/Types_types.h"
@@ -183,18 +184,20 @@ static void register_cgroups(const std::string& user, const std::string& group)
 }
 
 Status FragmentExecState::execute() {
-    MonotonicStopWatch watch;
-    watch.start();
-    // TODO(zc): add dpp into cgroups
-    if (_set_rsc_info) {
-        register_cgroups(_user, _group);
-    } else {
-        CgroupsMgr::apply_system_cgroup();
-    }
+    int64_t duration_ns = 0;
+    {
+        SCOPED_RAW_TIMER(&duration_ns);
+        if (_set_rsc_info) {
+            register_cgroups(_user, _group);
+        } else {
+            CgroupsMgr::apply_system_cgroup();
+        }
 
-    _executor.open();
-    _executor.close();
-    LOG(INFO) << "execute time is " << watch.elapsed_time() / 1000000;
+        _executor.open();
+        _executor.close();
+    }
+    PaloMetrics::fragment_requests_total.increment(1);
+    PaloMetrics::fragment_request_duration_us.increment(duration_ns / 1000);
     return Status::OK;
 }
 
diff --git a/be/src/runtime/initial_reservations.cc b/be/src/runtime/initial_reservations.cc
new file mode 100644
index 0000000000..f622a60f3d
--- /dev/null
+++ b/be/src/runtime/initial_reservations.cc
@@ -0,0 +1,95 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "runtime/initial_reservations.h"
+
+#include 
+
+#include 
+#include 
+
+#include "common/logging.h"
+#include "common/object_pool.h"
+#include "runtime/exec_env.h"
+#include "runtime/mem_tracker.h"
+#include "util/debug_util.h"
+#include "util/pretty_printer.h"
+
+#include "common/names.h"
+
+using std::numeric_limits;
+
+
+namespace palo {
+
+InitialReservations::InitialReservations(ObjectPool* obj_pool,
+    ReservationTracker* query_reservation, MemTracker* query_mem_tracker,
+    int64_t initial_reservation_total_claims)
+  : initial_reservation_mem_tracker_(obj_pool->add(
+      new MemTracker(-1, "Unclaimed reservations", query_mem_tracker, false))),
+      remaining_initial_reservation_claims_(initial_reservation_total_claims) {
+  initial_reservations_.InitChildTracker(nullptr, query_reservation,
+      initial_reservation_mem_tracker_, numeric_limits::max());
+}
+
+Status InitialReservations::Init(
+    const TUniqueId& query_id, int64_t query_min_reservation) {
+  DCHECK_EQ(0, initial_reservations_.GetReservation()) << "Already inited";
+  if (!initial_reservations_.IncreaseReservation(query_min_reservation)) {
+      Status status;
+      std::stringstream ss;
+      ss  << "Minimum reservation unavaliable: " << query_min_reservation
+          << " query id:" << query_id; 
+      status.add_error_msg(TStatusCode::MINIMUM_RESERVATION_UNAVAILABLE, ss.str());
+      return status;
+  }
+  VLOG_QUERY << "Successfully claimed initial reservations ("
+            << PrettyPrinter::print(query_min_reservation, TUnit::BYTES) << ") for"
+            << " query " << print_id(query_id);
+  return Status::OK;
+}
+
+void InitialReservations::Claim(BufferPool::ClientHandle* dst, int64_t bytes) {
+  DCHECK_GE(bytes, 0);
+  lock_guard l(lock_);
+  DCHECK_LE(bytes, remaining_initial_reservation_claims_);
+  bool success = dst->TransferReservationFrom(&initial_reservations_, bytes);
+  DCHECK(success) << "Planner computation should ensure enough initial reservations";
+  remaining_initial_reservation_claims_ -= bytes;
+}
+
+void InitialReservations::Return(BufferPool::ClientHandle* src, int64_t bytes) {
+  lock_guard l(lock_);
+  bool success = src->TransferReservationTo(&initial_reservations_, bytes);
+  // No limits on our tracker - no way this should fail.
+  DCHECK(success);
+  // Check to see if we can release any reservation.
+  int64_t excess_reservation =
+    initial_reservations_.GetReservation() - remaining_initial_reservation_claims_;
+  if (excess_reservation > 0) {
+    initial_reservations_.DecreaseReservation(excess_reservation);
+  }
+}
+
+void InitialReservations::ReleaseResources() {
+  initial_reservations_.Close();
+  initial_reservation_mem_tracker_->close();
+}
+}
diff --git a/be/src/runtime/initial_reservations.h b/be/src/runtime/initial_reservations.h
new file mode 100644
index 0000000000..e49fe5321b
--- /dev/null
+++ b/be/src/runtime/initial_reservations.h
@@ -0,0 +1,84 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_RUNTIME_INITIAL_RESERVATIONS_H
+#define BDG_PALO_BE_RUNTIME_INITIAL_RESERVATIONS_H
+
+#include "common/status.h"
+#include "gen_cpp/Types_types.h" // for TUniqueId
+#include "runtime/bufferpool/buffer_pool.h"
+#include "runtime/bufferpool/reservation_tracker.h"
+#include "util/spinlock.h"
+
+namespace palo {
+
+class ObjectPool;
+
+/**
+ * Manages the pool of initial reservations for different nodes in the plan tree.
+ * Each plan node and sink claims its initial reservation from here, then returns it when
+ * it is done executing. The frontend is responsible for making sure that enough initial
+ * reservation is in this pool for all of the concurrent claims.
+ */
+class InitialReservations {
+ public:
+  /// 'query_reservation' and 'query_mem_tracker' are the top-level trackers for the
+  /// query. This creates trackers for initial reservations under those.
+  /// 'initial_reservation_total_claims' is the total of initial reservations that will be
+  /// claimed over the lifetime of the query. The total bytes claimed via Claim()
+  /// cannot exceed this. Allocated objects are stored in 'obj_pool'.
+  InitialReservations(ObjectPool* obj_pool, ReservationTracker* query_reservation,
+      MemTracker* query_mem_tracker, int64_t initial_reservation_total_claims);
+
+  /// Initialize the query's pool of initial reservations by acquiring the minimum
+  /// reservation required for the query on this host. Fails if the reservation could
+  /// not be acquired, e.g. because it would exceed a pool or process limit.
+  Status Init(
+      const TUniqueId& query_id, int64_t query_min_reservation) WARN_UNUSED_RESULT;
+
+  /// Claim the initial reservation of 'bytes' for 'dst'. Assumes that the transfer will
+  /// not violate any reservation limits on 'dst'.
+  void Claim(BufferPool::ClientHandle* dst, int64_t bytes);
+
+  /// Return the initial reservation of 'bytes' from 'src'. The reservation is returned
+  /// to the pool of reservations if it may be needed to satisfy a subsequent claim or
+  /// otherwise is released.
+  void Return(BufferPool::ClientHandle* src, int64_t bytes);
+
+  /// Release any reservations held onto by this object.
+  void ReleaseResources();
+
+ private:
+  // Protects all below members to ensure that the internal state is consistent.
+  SpinLock lock_;
+
+  // The pool of initial reservations that Claim() returns reservations from and
+  // Return() returns reservations to.
+  ReservationTracker initial_reservations_;
+
+  MemTracker* const initial_reservation_mem_tracker_;
+
+  /// The total bytes of additional reservations that we expect to be claimed.
+  /// initial_reservations_->GetReservation() <= remaining_initial_reservation_claims_.
+  int64_t remaining_initial_reservation_claims_;
+};
+}
+
+#endif
diff --git a/be/src/runtime/mem_pool.cpp b/be/src/runtime/mem_pool.cpp
index 11ed7e1ba7..06637f856d 100644
--- a/be/src/runtime/mem_pool.cpp
+++ b/be/src/runtime/mem_pool.cpp
@@ -20,334 +20,257 @@
 
 #include "runtime/mem_pool.h"
 #include "runtime/mem_tracker.h"
-#include "runtime/mem_tracker.h"
+#include "util/bit_util.h"
 #include "util/palo_metrics.h"
 
 #include 
 #include 
 #include 
 
-namespace palo {
+#include "common/names.h"
+
+using namespace palo;
 
 #define MEM_POOL_POISON (0x66aa77bb)
 
-const int MemPool::DEFAULT_INITIAL_CHUNK_SIZE;
-const int64_t MemPool::MAX_CHUNK_SIZE;
+const int MemPool::INITIAL_CHUNK_SIZE;
+const int MemPool::MAX_CHUNK_SIZE;
 
-const char* MemPool::_s_llvm_class_name = "class.palo::MemPool";
+const char* MemPool::LLVM_CLASS_NAME = "class.impala::MemPool";
+const int MemPool::DEFAULT_ALIGNMENT;
+uint32_t MemPool::zero_length_region_  = MEM_POOL_POISON;
 
-// uint32_t MemPool::_s_zero_length_region alignas(max_align_t) = MEM_POOL_POISON;
-uint32_t MemPool::_s_zero_length_region = MEM_POOL_POISON;
-
-MemPool::MemPool(MemTracker* mem_tracker, int chunk_size) :
-        _current_chunk_idx(-1),
-        _last_offset_conversion_chunk_idx(-1),
-        // round up chunk size to nearest 8 bytes
-        _chunk_size(chunk_size == 0 ? 0 : ((chunk_size + 7) / 8) * 8),
-        _total_allocated_bytes(0),
-        // _total_chunk_bytes(0),
-        _peak_allocated_bytes(0),
-        _total_reserved_bytes(0),
-        _mem_tracker(mem_tracker) {
-    DCHECK_GE(_chunk_size, 0);
-    DCHECK(mem_tracker != NULL);
+MemPool::MemPool(MemTracker* mem_tracker)
+  : current_chunk_idx_(-1),
+    next_chunk_size_(INITIAL_CHUNK_SIZE),
+    total_allocated_bytes_(0),
+    total_reserved_bytes_(0),
+    peak_allocated_bytes_(0),
+    mem_tracker_(mem_tracker) {
+  DCHECK(mem_tracker != NULL);
+  DCHECK_EQ(zero_length_region_, MEM_POOL_POISON);
 }
 
-MemPool::ChunkInfo::ChunkInfo(int64_t size, uint8_t* buf) :
-        owns_data(true),
-        data(buf),
-        size(size),
-        cumulative_allocated_bytes(0),
-        allocated_bytes(0) {
-    if (PaloMetrics::mem_pool_total_bytes() != NULL) {
-        PaloMetrics::mem_pool_total_bytes()->increment(size);
-    }
+MemPool::ChunkInfo::ChunkInfo(int64_t size, uint8_t* buf)
+  : data(buf),
+    size(size),
+    allocated_bytes(0) {
+   PaloMetrics::memory_pool_bytes_total.increment(size);
 }
 
 MemPool::~MemPool() {
-    int64_t total_bytes_released = 0;
-    for (size_t i = 0; i < _chunks.size(); ++i) {
-        if (!_chunks[i].owns_data) {
-            continue;
-        }
+  int64_t total_bytes_released = 0;
+  for (size_t i = 0; i < chunks_.size(); ++i) {
+    total_bytes_released += chunks_[i].size;
+    free(chunks_[i].data);
+  }
+ 
+  mem_tracker_->release(total_bytes_released);
+  //TODO chenhao , check all using MemPool and open it
+  //DCHECK(chunks_.empty()) << "Must call FreeAll() or AcquireData() for this pool";
 
-        total_bytes_released += _chunks[i].size;
-        free(_chunks[i].data);
-    }
-    _chunks.clear();
+  PaloMetrics::memory_pool_bytes_total.increment(-total_bytes_released);
 
-    _mem_tracker->release(total_bytes_released);
-    // DCHECK(_chunks.empty()) << "Must call FreeAll() or AcquireData() for this pool";
+  //DCHECK_EQ(zero_length_region_, MEM_POOL_POISON);
+}
 
-    if (PaloMetrics::mem_pool_total_bytes() != NULL) {
-        PaloMetrics::mem_pool_total_bytes()->increment(-total_bytes_released);
-    }
+void MemPool::clear() {
+  current_chunk_idx_ = -1;
+  for (auto& chunk: chunks_) {
+    chunk.allocated_bytes = 0;
+    ASAN_POISON_MEMORY_REGION(chunk.data, chunk.size);
+  }
+  total_allocated_bytes_ = 0;
+  DCHECK(CheckIntegrity(false));
 }
 
 void MemPool::free_all() {
-    int64_t total_bytes_released = 0;
-    for (size_t i = 0; i < _chunks.size(); ++i) {
-        if (!_chunks[i].owns_data) {
-            continue;
-        }
-        total_bytes_released += _chunks[i].size;
-        free(_chunks[i].data);
-    }
-    _chunks.clear();
-    _current_chunk_idx = -1;
-    _last_offset_conversion_chunk_idx = -1;
-    _total_allocated_bytes = 0;
-    _total_reserved_bytes = 0;
+  int64_t total_bytes_released = 0;
+  for (auto& chunk: chunks_) {
+    total_bytes_released += chunk.size;
+    free(chunk.data);
+  }
+  chunks_.clear();
+  next_chunk_size_ = INITIAL_CHUNK_SIZE;
+  current_chunk_idx_ = -1;
+  total_allocated_bytes_ = 0;
+  total_reserved_bytes_ = 0;
 
-    _mem_tracker->release(total_bytes_released);
-    if (PaloMetrics::mem_pool_total_bytes() != NULL) {
-        PaloMetrics::mem_pool_total_bytes()->increment(-total_bytes_released);
-    }
+  mem_tracker_->release(total_bytes_released);
+  PaloMetrics::memory_pool_bytes_total.increment(-total_bytes_released);
 }
 
-bool MemPool::find_chunk(int64_t min_size, bool check_limits) {
-    // Try to allocate from a free chunk. The first free chunk, if any, will be immediately
-    // after the current chunk.
-    int first_free_idx = _current_chunk_idx + 1;
-
-    // (cast size() to signed int in order to avoid everything else being cast to
-    // unsigned long, in particular -1)
-    while (++_current_chunk_idx  < static_cast(_chunks.size())) {
-        // we found a free chunk
-        DCHECK_EQ(_chunks[_current_chunk_idx].allocated_bytes, 0);
-        if (_chunks[_current_chunk_idx].size >= min_size) {
-            // This chunk is big enough.  Move it before the other free chunks.
-            if (_current_chunk_idx != first_free_idx) {
-                std::swap(_chunks[_current_chunk_idx], _chunks[first_free_idx]);
-                _current_chunk_idx = first_free_idx;
-            }
-            break;
-        }
+bool MemPool::FindChunk(size_t min_size, bool check_limits) {
+  // Try to allocate from a free chunk. We may have free chunks after the current chunk
+  // if Clear() was called. The current chunk may be free if ReturnPartialAllocation()
+  // was called. The first free chunk (if there is one) can therefore be either the
+  // current chunk or the chunk immediately after the current chunk.
+  int first_free_idx;
+  if (current_chunk_idx_ == -1) {
+    first_free_idx = 0;
+  } else {
+    DCHECK_GE(current_chunk_idx_, 0);
+    first_free_idx = current_chunk_idx_ +
+        (chunks_[current_chunk_idx_].allocated_bytes > 0);
+  }
+  for (int idx = current_chunk_idx_ + 1; idx < chunks_.size(); ++idx) {
+    // All chunks after 'current_chunk_idx_' should be free.
+    DCHECK_EQ(chunks_[idx].allocated_bytes, 0);
+    if (chunks_[idx].size >= min_size) {
+      // This chunk is big enough. Move it before the other free chunks.
+      if (idx != first_free_idx) std::swap(chunks_[idx], chunks_[first_free_idx]);
+      current_chunk_idx_ = first_free_idx;
+      DCHECK(CheckIntegrity(true));
+      return true;
     }
+  }
 
-    if (_current_chunk_idx == static_cast(_chunks.size())) {
-        // need to allocate new chunk.
-        int64_t chunk_size = _chunk_size;
-        if (chunk_size == 0) {
-            if (_current_chunk_idx == 0) {
-                chunk_size = DEFAULT_INITIAL_CHUNK_SIZE;
-            } else {
-                // double the size of the last chunk in the list, up to a maximum
-                // TODO: stick with constant sizes throughout?
-                chunk_size = std::min(_chunks[_current_chunk_idx - 1].size * 2, MAX_CHUNK_SIZE);
-            }
-        }
-        chunk_size = std::max(min_size, chunk_size);
+  // Didn't find a big enough free chunk - need to allocate new chunk.
+  size_t chunk_size = 0;
+  DCHECK_LE(next_chunk_size_, MAX_CHUNK_SIZE);
 
-        if (check_limits) {
-            if (!_mem_tracker->try_consume(chunk_size)) {
-                // We couldn't allocate a new chunk so _current_chunk_idx is now be past the
-                // end of _chunks.
-                DCHECK_EQ(_current_chunk_idx, static_cast(_chunks.size()));
-                _current_chunk_idx = static_cast(_chunks.size()) - 1;
-                return false;
-            }
-        } else {
-            _mem_tracker->consume(chunk_size);
-        }
+  if (config::FLAGS_disable_mem_pools) {
+    // Disable pooling by sizing the chunk to fit only this allocation.
+    // Make sure the alignment guarantees are respected.
+    chunk_size = std::max(min_size, alignof(max_align_t));
+  } else {
+    DCHECK_GE(next_chunk_size_, INITIAL_CHUNK_SIZE);
+    chunk_size = max(min_size, next_chunk_size_);
+  }
 
-        // Allocate a new chunk. Return early if malloc fails.
-        uint8_t* buf = reinterpret_cast(malloc(chunk_size));
-        if (UNLIKELY(buf == NULL)) {
-            _mem_tracker->release(chunk_size);
-            DCHECK_EQ(_current_chunk_idx, static_cast(_chunks.size()));
-            _current_chunk_idx = static_cast(_chunks.size()) - 1;
-            return false;
-        }
+  if (check_limits) {
+    if (!mem_tracker_->try_consume(chunk_size)) return false;
+  } else {
+    mem_tracker_->consume(chunk_size);
+  }
 
-        // If there are no free chunks put it at the end, otherwise before the first free.
-        if (first_free_idx == static_cast(_chunks.size())) {
-            _chunks.push_back(ChunkInfo(chunk_size, buf));
-        } else {
-            _current_chunk_idx = first_free_idx;
-            std::vector::iterator insert_chunk = _chunks.begin() + _current_chunk_idx;
-            _chunks.insert(insert_chunk, ChunkInfo(chunk_size, buf));
-        }
-        _total_reserved_bytes += chunk_size;
-    }
+  // Allocate a new chunk. Return early if malloc fails.
+  uint8_t* buf = reinterpret_cast(malloc(chunk_size));
+  if (UNLIKELY(buf == NULL)) {
+    mem_tracker_->release(chunk_size);
+    return false;
+  }
 
-    if (_current_chunk_idx > 0) {
-        ChunkInfo& prev_chunk = _chunks[_current_chunk_idx - 1];
-        _chunks[_current_chunk_idx].cumulative_allocated_bytes =
-            prev_chunk.cumulative_allocated_bytes + prev_chunk.allocated_bytes;
-    }
+  ASAN_POISON_MEMORY_REGION(buf, chunk_size);
 
-    DCHECK_LT(_current_chunk_idx, static_cast(_chunks.size()));
-    DCHECK(check_integrity(true));
-    return true;
+  // Put it before the first free chunk. If no free chunks, it goes at the end.
+  if (first_free_idx == static_cast(chunks_.size())) {
+    chunks_.push_back(ChunkInfo(chunk_size, buf));
+  } else {
+    chunks_.insert(chunks_.begin() + first_free_idx, ChunkInfo(chunk_size, buf));
+  }
+  current_chunk_idx_ = first_free_idx;
+  total_reserved_bytes_ += chunk_size;
+  // Don't increment the chunk size until the allocation succeeds: if an attempted
+  // large allocation fails we don't want to increase the chunk size further.
+  next_chunk_size_ = static_cast(min(chunk_size * 2, MAX_CHUNK_SIZE));
+
+  DCHECK(CheckIntegrity(true));
+  return true;
 }
 
 void MemPool::acquire_data(MemPool* src, bool keep_current) {
-    DCHECK(src->check_integrity(false));
-    int num_acquired_chunks = 0;
+  DCHECK(src->CheckIntegrity(false));
+  int num_acquired_chunks;
+  if (keep_current) {
+    num_acquired_chunks = src->current_chunk_idx_;
+  } else if (src->GetFreeOffset() == 0) {
+    // nothing in the last chunk
+    num_acquired_chunks = src->current_chunk_idx_;
+  } else {
+    num_acquired_chunks = src->current_chunk_idx_ + 1;
+  }
 
-    if (keep_current) {
-        num_acquired_chunks = src->_current_chunk_idx;
-    } else if (src->get_free_offset() == 0) {
-        // nothing in the last chunk
-        num_acquired_chunks = src->_current_chunk_idx;
-    } else {
-        num_acquired_chunks = src->_current_chunk_idx + 1;
-    }
+  if (num_acquired_chunks <= 0) {
+    if (!keep_current) src->free_all();
+    return;
+  }
 
-    if (num_acquired_chunks <= 0) {
-        if (!keep_current) {
-            src->free_all();
-        }
-        return;
-    }
+  vector::iterator end_chunk = src->chunks_.begin() + num_acquired_chunks;
+  int64_t total_transfered_bytes = 0;
+  for (vector::iterator i = src->chunks_.begin(); i != end_chunk; ++i) {
+    total_transfered_bytes += i->size;
+  }
+  src->total_reserved_bytes_ -= total_transfered_bytes;
+  total_reserved_bytes_ += total_transfered_bytes;
 
-    std::vector::iterator end_chunk = src->_chunks.begin() + num_acquired_chunks;
-    int64_t total_transfered_bytes = 0;
-    for (std::vector::iterator i = src->_chunks.begin(); i != end_chunk; ++i) {
-        total_transfered_bytes += i->size;
-    }
-    src->_total_reserved_bytes -= total_transfered_bytes;
-    _total_reserved_bytes += total_transfered_bytes;
+  // Skip unnecessary atomic ops if the mem_trackers are the same.
+  if (src->mem_tracker_ != mem_tracker_) {
+    src->mem_tracker_->release(total_transfered_bytes);
+    mem_tracker_->consume(total_transfered_bytes);
+  }
 
-    src->_mem_tracker->release(total_transfered_bytes);
-    _mem_tracker->consume(total_transfered_bytes);
+  // insert new chunks after current_chunk_idx_
+  vector::iterator insert_chunk = chunks_.begin() + current_chunk_idx_ + 1;
+  chunks_.insert(insert_chunk, src->chunks_.begin(), end_chunk);
+  src->chunks_.erase(src->chunks_.begin(), end_chunk);
+  current_chunk_idx_ += num_acquired_chunks;
 
-    // insert new chunks after _current_chunk_idx
-    std::vector::iterator insert_chunk = _chunks.begin() + _current_chunk_idx + 1;
-    _chunks.insert(insert_chunk, src->_chunks.begin(), end_chunk);
-    src->_chunks.erase(src->_chunks.begin(), end_chunk);
-    _current_chunk_idx += num_acquired_chunks;
+  if (keep_current) {
+    src->current_chunk_idx_ = 0;
+    DCHECK(src->chunks_.size() == 1 || src->chunks_[1].allocated_bytes == 0);
+    total_allocated_bytes_ += src->total_allocated_bytes_ - src->GetFreeOffset();
+    src->total_allocated_bytes_ = src->GetFreeOffset();
+  } else {
+    src->current_chunk_idx_ = -1;
+    total_allocated_bytes_ += src->total_allocated_bytes_;
+    src->total_allocated_bytes_ = 0;
+  }
 
-    if (keep_current) {
-        src->_current_chunk_idx = 0;
-        DCHECK(src->_chunks.size() == 1 || src->_chunks[1].allocated_bytes == 0);
-        _total_allocated_bytes += src->_total_allocated_bytes - src->get_free_offset();
-        src->_chunks[0].cumulative_allocated_bytes = 0;
-        src->_total_allocated_bytes = src->get_free_offset();
-    } else {
-        src->_current_chunk_idx = -1;
-        _total_allocated_bytes += src->_total_allocated_bytes;
-        src->_total_allocated_bytes = 0;
-    }
-    _peak_allocated_bytes = std::max(_total_allocated_bytes, _peak_allocated_bytes);
+  peak_allocated_bytes_ = std::max(total_allocated_bytes_, peak_allocated_bytes_);
 
-    // recompute cumulative_allocated_bytes
-    int start_idx = _chunks.size() - num_acquired_chunks;
-    int64_t cumulative_bytes = (start_idx == 0
-                                ? 0
-                                : _chunks[start_idx - 1].cumulative_allocated_bytes
-                                + _chunks[start_idx - 1].allocated_bytes);
-    for (int i = start_idx; i <= _current_chunk_idx; ++i) {
-        _chunks[i].cumulative_allocated_bytes = cumulative_bytes;
-        cumulative_bytes += _chunks[i].allocated_bytes;
-    }
-
-    if (!keep_current) {
-        src->free_all();
-    }
-    DCHECK(check_integrity(false));
+  if (!keep_current) src->free_all();
+  DCHECK(src->CheckIntegrity(false));
+  DCHECK(CheckIntegrity(false));
 }
 
-bool MemPool::contains(uint8_t* ptr, int size) {
-    for (int i = 0; i < _chunks.size(); ++i) {
-        const ChunkInfo& info = _chunks[i];
-        if (ptr >= info.data && ptr < info.data + info.allocated_bytes) {
-            if (ptr + size > info.data + info.allocated_bytes) {
-                DCHECK_LE(reinterpret_cast(ptr + size),
-                          reinterpret_cast(info.data + info.allocated_bytes));
-                return false;
-            }
-            return true;
-        }
-    }
-    return false;
-}
-
-std::string MemPool::debug_string() {
-    std::stringstream out;
-    char str[16];
-    out << "MemPool(#chunks=" << _chunks.size() << " [";
-    for (int i = 0; i < _chunks.size(); ++i) {
-        snprintf(str, 16, "0x%lx=", reinterpret_cast(_chunks[i].data));
-        out << (i > 0 ? " " : "")
-            << str
-            << _chunks[i].size
-            << "/" << _chunks[i].cumulative_allocated_bytes
-            << "/" << _chunks[i].allocated_bytes;
-    }
-
-    out << "] current_chunk=" << _current_chunk_idx
-        << " total_sizes=" << get_total_chunk_sizes()
-        << " total_alloc=" << _total_allocated_bytes
-        << ")";
-    return out.str();
+string MemPool::DebugString() {
+  stringstream out;
+  char str[16];
+  out << "MemPool(#chunks=" << chunks_.size() << " [";
+  for (int i = 0; i < chunks_.size(); ++i) {
+    sprintf(str, "0x%lx=", reinterpret_cast(chunks_[i].data));
+    out << (i > 0 ? " " : "")
+        << str
+        << chunks_[i].size
+        << "/" << chunks_[i].allocated_bytes;
+  }
+  out << "] current_chunk=" << current_chunk_idx_
+      << " total_sizes=" << get_total_chunk_sizes()
+      << " total_alloc=" << total_allocated_bytes_
+      << ")";
+  return out.str();
 }
 
 int64_t MemPool::get_total_chunk_sizes() const {
-    int64_t result = 0;
-    for (int i = 0; i < _chunks.size(); ++i) {
-        result += _chunks[i].size;
-    }
-    return result;
+  int64_t result = 0;
+  for (int i = 0; i < chunks_.size(); ++i) {
+    result += chunks_[i].size;
+  }
+  return result;
 }
 
-bool MemPool::check_integrity(bool current_chunk_empty) {
-    // check that _current_chunk_idx points to the last chunk with allocated data
-    DCHECK_LT(_current_chunk_idx, static_cast(_chunks.size()));
-    int64_t total_allocated = 0;
-    for (int i = 0; i < _chunks.size(); ++i) {
-        DCHECK_GT(_chunks[i].size, 0);
-        if (i < _current_chunk_idx) {
-            DCHECK_GT(_chunks[i].allocated_bytes, 0);
-        } else if (i == _current_chunk_idx) {
-            if (current_chunk_empty) {
-                DCHECK_EQ(_chunks[i].allocated_bytes, 0);
-            } else {
-                DCHECK_GT(_chunks[i].allocated_bytes, 0);
-            }
-        } else {
-            DCHECK_EQ(_chunks[i].allocated_bytes, 0);
-        }
+bool MemPool::CheckIntegrity(bool check_current_chunk_empty) {
+  DCHECK_EQ(zero_length_region_, MEM_POOL_POISON);
+  DCHECK_LT(current_chunk_idx_, static_cast(chunks_.size()));
 
-        if (i > 0 && i <= _current_chunk_idx) {
-            DCHECK_EQ(_chunks[i - 1].cumulative_allocated_bytes + _chunks[i - 1].allocated_bytes,
-                      _chunks[i].cumulative_allocated_bytes);
-        }
+  // Without pooling, there are way too many chunks and this takes too long.
+  if (config::FLAGS_disable_mem_pools) return true;
 
-        if (_chunk_size != 0) {
-            DCHECK_GE(_chunks[i].size, _chunk_size);
-        }
-        total_allocated += _chunks[i].allocated_bytes;
+  // check that current_chunk_idx_ points to the last chunk with allocated data
+  int64_t total_allocated = 0;
+  for (int i = 0; i < chunks_.size(); ++i) {
+    DCHECK_GT(chunks_[i].size, 0);
+    if (i < current_chunk_idx_) {
+      DCHECK_GT(chunks_[i].allocated_bytes, 0);
+    } else if (i == current_chunk_idx_) {
+      DCHECK_GE(chunks_[i].allocated_bytes, 0);
+      if (check_current_chunk_empty) DCHECK_EQ(chunks_[i].allocated_bytes, 0);
+    } else {
+      DCHECK_EQ(chunks_[i].allocated_bytes, 0);
     }
-
-    DCHECK_EQ(total_allocated, _total_allocated_bytes);
-    return true;
+    total_allocated += chunks_[i].allocated_bytes;
+  }
+  DCHECK_EQ(total_allocated, total_allocated_bytes_);
+  return true;
 }
-
-void MemPool::get_chunk_info(std::vector >* chunk_info) {
-    chunk_info->clear();
-    for (std::vector::iterator info = _chunks.begin(); info != _chunks.end(); ++info) {
-        chunk_info->push_back(std::make_pair(info->data, info->allocated_bytes));
-    }
-}
-
-std::string MemPool::debug_print() {
-    char str[3];
-    std::stringstream out;
-    for (int i = 0; i < _chunks.size(); ++i) {
-        ChunkInfo& info = _chunks[i];
-
-        if (info.allocated_bytes == 0) {
-            return out.str();
-        }
-
-        for (int j = 0; j < info.allocated_bytes; ++j) {
-            snprintf(str, 3, "%x ", info.data[j]);
-            out << str;
-        }
-    }
-    return out.str();
-}
-
-} // end namespace palo
diff --git a/be/src/runtime/mem_pool.h b/be/src/runtime/mem_pool.h
index 945c221cec..edbc6e38c5 100644
--- a/be/src/runtime/mem_pool.h
+++ b/be/src/runtime/mem_pool.h
@@ -18,300 +18,282 @@
 // specific language governing permissions and limitations
 // under the License.
 
+
 #ifndef BDG_PALO_BE_RUNTIME_MEM_POOL_H
 #define BDG_PALO_BE_RUNTIME_MEM_POOL_H
 
 #include 
-#include 
 
-#include 
 #include 
-#include 
+#include 
 #include 
+#include 
 
-#include "common/compiler_util.h"
 #include "common/logging.h"
+#include "gutil/dynamic_annotations.h"
 #include "util/bit_util.h"
-#include "util/debug_util.h"
 
 namespace palo {
 
 class MemTracker;
 
-// A MemPool maintains a list of memory chunks from which it allocates memory in
-// response to allocate() calls;
-// Chunks stay around for the lifetime of the mempool or until they are passed on to
-// another mempool.
+/// A MemPool maintains a list of memory chunks from which it allocates memory in
+/// response to Allocate() calls;
+/// Chunks stay around for the lifetime of the mempool or until they are passed on to
+/// another mempool.
 //
-// The caller registers a MemTrackers with the pool; chunk allocations are counted
-// against that tracker and all of its ancestors. If chunks get moved between pools
-// during AcquireData() calls, the respective MemTrackers are updated accordingly.
-// Chunks freed up in the d'tor are subtracted from the registered limits.
+/// The caller registers a MemTracker with the pool; chunk allocations are counted
+/// against that tracker and all of its ancestors. If chunks get moved between pools
+/// during AcquireData() calls, the respective MemTrackers are updated accordingly.
+/// Chunks freed up in the d'tor are subtracted from the registered trackers.
 //
-// An allocate() call will attempt to allocate memory from the chunk that was most
-// recently added; if that chunk doesn't have enough memory to
-// satisfy the allocation request, the free chunks are searched for one that is
-// big enough otherwise a new chunk is added to the list.
-// The _current_chunk_idx always points to the last chunk with allocated memory.
-// In order to keep allocation overhead low, chunk sizes double with each new one
-// added, until they hit a maximum size.
+/// An Allocate() call will attempt to allocate memory from the chunk that was most
+/// recently added; if that chunk doesn't have enough memory to
+/// satisfy the allocation request, the free chunks are searched for one that is
+/// big enough otherwise a new chunk is added to the list.
+/// In order to keep allocation overhead low, chunk sizes double with each new one
+/// added, until they hit a maximum size.
+///
+/// Allocated chunks can be reused for new allocations if Clear() is called to free
+/// all allocations or ReturnPartialAllocation() is called to return part of the last
+/// allocation.
+///
+/// All chunks before 'current_chunk_idx_' have allocated memory, while all chunks
+/// after 'current_chunk_idx_' are free. The chunk at 'current_chunk_idx_' may or may
+/// not have allocated memory.
+///
+///     Example:
+///     MemPool* p = new MemPool();
+///     for (int i = 0; i < 1024; ++i) {
+/// returns 8-byte aligned memory (effectively 24 bytes):
+///       .. = p->Allocate(17);
+///     }
+/// at this point, 17K have been handed out in response to Allocate() calls and
+/// 28K of chunks have been allocated (chunk sizes: 4K, 8K, 16K)
+/// We track total and peak allocated bytes. At this point they would be the same:
+/// 28k bytes.  A call to Clear will return the allocated memory so
+/// total_allocated_bytes_ becomes 0.
+///     p->Clear();
+/// the entire 1st chunk is returned:
+///     .. = p->Allocate(4 * 1024);
+/// 4K of the 2nd chunk are returned:
+///     .. = p->Allocate(4 * 1024);
+/// a new 20K chunk is created
+///     .. = p->Allocate(20 * 1024);
 //
-//     Example:
-//     MemPool* p = new MemPool();
-//     for (int i = 0; i < 1024; ++i) {
-// returns 8-byte aligned memory (effectively 24 bytes):
-//       .. = p->allocate(17);
-//     }
-// at this point, 17K have been handed out in response to allocate() calls and
-// 28K of chunks have been allocated (chunk sizes: 4K, 8K, 16K)
-// We track total and peak allocated bytes. At this point they would be the same:
-// 28k bytes.  A call to Clear will return the allocated memory so
-// _total_allocate_bytes
-// becomes 0 while _peak_allocate_bytes remains at 28k.
-//     p->Clear();
-// the entire 1st chunk is returned:
-//     .. = p->allocate(4 * 1024);
-// 4K of the 2nd chunk are returned:
-//     .. = p->allocate(4 * 1024);
-// a new 20K chunk is created
-//     .. = p->allocate(20 * 1024);
-//
-//      MemPool* p2 = new MemPool();
-// the new mempool receives all chunks containing data from p
-//      p2->AcquireData(p, false);
-// At this point p._total_allocated_bytes would be 0 while p._peak_allocated_bytes
-// remains unchanged.
-// The one remaining (empty) chunk is released:
-//    delete p;
+///      MemPool* p2 = new MemPool();
+/// the new mempool receives all chunks containing data from p
+///      p2->AcquireData(p, false);
+/// At this point p.total_allocated_bytes_ would be 0.
+/// The one remaining (empty) chunk is released:
+///    delete p;
 
 class MemPool {
-public:
-    // Allocates mempool with fixed-size chunks of size 'chunk_size'.
-    // Chunk_size must be >= 0; 0 requests automatic doubling of chunk sizes,
-    // up to a limit.
-    // 'tracker' tracks the amount of memory allocated by this pool. Must not be NULL.
-    MemPool(MemTracker* mem_tracker, int chunk_size);
-    MemPool(MemTracker* mem_tracker) : MemPool::MemPool(mem_tracker, 0) {}
+ public:
 
-    // Frees all chunks of memory and subtracts the total allocated bytes
-    // from the registered limits.
-    ~MemPool();
+  /// 'tracker' tracks the amount of memory allocated by this pool. Must not be NULL.
+  MemPool(MemTracker* mem_tracker);
 
-    // Allocates 8-byte aligned section of memory of 'size' bytes at the end
-    // of the the current chunk. Creates a new chunk if there aren't any chunks
-    // with enough capacity.
-    uint8_t* allocate(int size) {
-        return allocate(size);
-    }
+  /// Frees all chunks of memory and subtracts the total allocated bytes
+  /// from the registered limits.
+  ~MemPool();
 
-    // Same as Allocate() except the mem limit is checked before the allocation and
-    // this call will fail (returns NULL) if it does.
-    // The caller must handle the NULL case. This should be used for allocations
-    // where the size can be very big to bound the amount by which we exceed mem limits.
-    uint8_t* try_allocate(int size) {
-        return allocate(size);
-    }
+  /// Allocates a section of memory of 'size' bytes with DEFAULT_ALIGNMENT at the end
+  /// of the the current chunk. Creates a new chunk if there aren't any chunks
+  /// with enough capacity.
+  uint8_t* allocate(int64_t size) {
+    return allocate(size, DEFAULT_ALIGNMENT);
+  }
 
-    // Returns 'byte_size' to the current chunk back to the mem pool. This can
-    // only be used to return either all or part of the previous allocation returned
-    // by Allocate().
-    void return_partial_allocation(int byte_size) {
-        DCHECK_GE(byte_size, 0);
-        DCHECK(_current_chunk_idx != -1);
-        ChunkInfo& info = _chunks[_current_chunk_idx];
-        DCHECK_GE(info.allocated_bytes, byte_size);
-        info.allocated_bytes -= byte_size;
-        _total_allocated_bytes -= byte_size;
-    }
+  /// Same as Allocate() except the mem limit is checked before the allocation and
+  /// this call will fail (returns NULL) if it does.
+  /// The caller must handle the NULL case. This should be used for allocations
+  /// where the size can be very big to bound the amount by which we exceed mem limits.
+  uint8_t* try_allocate(int64_t size) {
+    return allocate(size, DEFAULT_ALIGNMENT);
+  }
 
-    // Makes all allocated chunks available for re-use, but doesn't delete any chunks.
-    void clear() {
-        _current_chunk_idx = -1;
-        for (std::vector::iterator chunk = _chunks.begin();
-                chunk != _chunks.end(); ++chunk) {
-            chunk->cumulative_allocated_bytes = 0;
-            chunk->allocated_bytes = 0;
-        }
-        _total_allocated_bytes = 0;
-        DCHECK(check_integrity(false));
-    }
+  /// Same as TryAllocate() except a non-default alignment can be specified. It
+  /// should be a power-of-two in [1, alignof(std::max_align_t)].
+  uint8_t* try_allocate_aligned(int64_t size, int alignment) {
+    DCHECK_GE(alignment, 1);
+    DCHECK_LE(alignment, config::FLAGS_MEMORY_MAX_ALIGNMENT);
+    //DCHECK_LE(alignment, config::FLAGS_MEMORY_MAX_ALIGNMENT);
+    DCHECK_EQ(BitUtil::RoundUpToPowerOfTwo(alignment), alignment);
+    return allocate(size, alignment);
+  }
 
-    // Deletes all allocated chunks. FreeAll() or AcquireData() must be called for
-    // each mem pool
-    void free_all();
+  /// Same as TryAllocate() except returned memory is not aligned at all.
+  uint8_t* try_allocate_unaligned(int64_t size) {
+    // Call templated implementation directly so that it is inlined here and the
+    // alignment logic can be optimised out.
+    return allocate(size, 1);
+  }
 
-    // Absorb all chunks that hold data from src. If keep_current is true, let src hold on
-    // to its last allocated chunk that contains data.
-    // All offsets handed out by calls to get_offset()/get_current_offset() for 'src'
-    // become invalid.
-    // All offsets handed out by calls to GetCurrentOffset() for 'src' become invalid.
-    void acquire_data(MemPool* src, bool keep_current);
+  /// Returns 'byte_size' to the current chunk back to the mem pool. This can
+  /// only be used to return either all or part of the previous allocation returned
+  /// by Allocate().
+  void return_partial_allocation(int64_t byte_size) {
+    DCHECK_GE(byte_size, 0);
+    DCHECK(current_chunk_idx_ != -1);
+    ChunkInfo& info = chunks_[current_chunk_idx_];
+    DCHECK_GE(info.allocated_bytes, byte_size);
+    info.allocated_bytes -= byte_size;
+    ASAN_POISON_MEMORY_REGION(info.data + info.allocated_bytes, byte_size);
+    total_allocated_bytes_ -= byte_size;
+  }
 
-    // Diagnostic to check if memory is allocated from this mempool.
-    // Inputs:
-    //   ptr: start of memory block.
-    //   size: size of memory block.
-    // Returns true if memory block is in one of the chunks in this mempool.
-    bool contains(uint8_t* ptr, int size);
+  /// Return a dummy pointer for zero-length allocations.
+  static uint8_t* empty_alloc_ptr() {
+    return reinterpret_cast(&zero_length_region_);
+  }
 
-    std::string debug_string();
+  /// Makes all allocated chunks available for re-use, but doesn't delete any chunks.
+  void clear();
 
-    int64_t total_allocated_bytes() const {
-        return _total_allocated_bytes;
-    }
-    // int64_t total_chunk_bytes() const {
-    //     return _total_chunk_bytes;
-    // }
-    int64_t peak_allocated_bytes() const {
-        return _peak_allocated_bytes;
-    }
-    int64_t total_reserved_bytes() const {
-        return _total_reserved_bytes;
-    }
-    MemTracker* mem_tracker() {
-        return _mem_tracker;
-    }
+  /// Deletes all allocated chunks. FreeAll() or AcquireData() must be called for
+  /// each mem pool
+  void free_all();
 
-    // Return sum of _chunk_sizes.
-    int64_t get_total_chunk_sizes() const;
+  /// Absorb all chunks that hold data from src. If keep_current is true, let src hold on
+  /// to its last allocated chunk that contains data.
+  /// All offsets handed out by calls to GetCurrentOffset() for 'src' become invalid.
+  void acquire_data(MemPool* src, bool keep_current);
 
-    // Return logical offset of memory returned by next call to allocate()
-    // into allocated data.
-    int get_current_offset() const {
-        return _total_allocated_bytes;
-    }
+  std::string DebugString();
 
-    // Return (data ptr, allocated bytes) pairs for all chunks owned by this mempool.
-    void get_chunk_info(std::vector >* chunk_info);
+  int64_t total_allocated_bytes() const { return total_allocated_bytes_; }
+  int64_t total_reserved_bytes() const { return total_reserved_bytes_; }
+  int64_t peak_allocated_bytes() const { return peak_allocated_bytes_;}
 
-    // Print allocated bytes from all chunks.
-    std::string debug_print();
+  MemTracker* mem_tracker() { return mem_tracker_; }
 
-    // TODO: make a macro for doing this
-    // For C++/IR interop, we need to be able to look up types by name.
-    static const char* _s_llvm_class_name;
+  /// Return sum of chunk_sizes_.
+  int64_t get_total_chunk_sizes() const;
 
-private:
-    static const int DEFAULT_INITIAL_CHUNK_SIZE = 4 * 1024;
-    static const int64_t MAX_CHUNK_SIZE = 512 * 1024;
+  /// TODO: make a macro for doing this
+  /// For C++/IR interop, we need to be able to look up types by name.
+  static const char* LLVM_CLASS_NAME;
 
-    struct ChunkInfo {
-        bool owns_data;  // true if we eventually need to dealloc data
-        uint8_t* data;
-        int64_t size;  // in bytes
+  static const int DEFAULT_ALIGNMENT = 8;
 
-        // number of bytes allocated via allocate() up to but excluding this chunk;
-        // *not* valid for chunks > _current_chunk_idx (because that would create too
-        // much maintenance work if we have trailing unoccupied chunks)
-        int64_t cumulative_allocated_bytes;
+ private:
+  friend class MemPoolTest;
+  static const int INITIAL_CHUNK_SIZE = 4 * 1024;
 
-        // bytes allocated via allocate() in this chunk
-        int64_t allocated_bytes;
+  /// The maximum size of chunk that should be allocated. Allocations larger than this
+  /// size will get their own individual chunk.
+  static const int MAX_CHUNK_SIZE = 1024 * 1024;
 
-        // explicit ChunkInfo(int size);
-        explicit ChunkInfo(int64_t size, uint8_t* buf);
+  struct ChunkInfo {
+    uint8_t* data; // Owned by the ChunkInfo.
+    int64_t size;  // in bytes
 
-        ChunkInfo()
-            : owns_data(true),
-              data(NULL),
-              size(0),
-              cumulative_allocated_bytes(0),
-              allocated_bytes(0) {}
-    };
+    /// bytes allocated via Allocate() in this chunk
+    int64_t allocated_bytes;
 
-    // static uint32_t _s_zero_length_region alignas(max_align_t);
-    static uint32_t _s_zero_length_region;
+    explicit ChunkInfo(int64_t size, uint8_t* buf);
 
-    // chunk from which we served the last allocate() call;
-    // always points to the last chunk that contains allocated data;
-    // chunks 0.._current_chunk_idx are guaranteed to contain data
-    // (_chunks[i].allocated_bytes > 0 for i: 0.._current_chunk_idx);
-    // -1 if no chunks present
-    int _current_chunk_idx;
+    ChunkInfo()
+      : data(NULL),
+        size(0),
+        allocated_bytes(0) {}
+  };
 
-    // chunk where last offset conversion (get_offset() or get_data_ptr()) took place;
-    // -1 if those functions have never been called
-    int _last_offset_conversion_chunk_idx;
+  /// A static field used as non-NULL pointer for zero length allocations. NULL is
+  /// reserved for allocation failures. It must be as aligned as max_align_t for
+  /// TryAllocateAligned().
+  static uint32_t zero_length_region_;
 
-    int _chunk_size;  // if != 0, use this size for new chunks
+  /// chunk from which we served the last Allocate() call;
+  /// always points to the last chunk that contains allocated data;
+  /// chunks 0..current_chunk_idx_ - 1 are guaranteed to contain data
+  /// (chunks_[i].allocated_bytes > 0 for i: 0..current_chunk_idx_ - 1);
+  /// chunks after 'current_chunk_idx_' are "free chunks" that contain no data.
+  /// -1 if no chunks present
+  int current_chunk_idx_;
 
-    // sum of _allocated_bytes
-    int64_t _total_allocated_bytes;
+  /// The size of the next chunk to allocate.
+  int next_chunk_size_;
 
-    // sum of _total_chunk_bytes
-    // int64_t _total_chunk_bytes;
+  /// sum of allocated_bytes_
+  int64_t total_allocated_bytes_;
 
-    // Maximum number of bytes allocated from this pool at one time.
-    int64_t _peak_allocated_bytes;
+  /// sum of all bytes allocated in chunks_
+  int64_t total_reserved_bytes_;
 
-    // sum of all bytes allocated in _chunks
-    int64_t _total_reserved_bytes;
+  /// Maximum number of bytes allocated from this pool at one time.
+  int64_t peak_allocated_bytes_;
 
-    std::vector _chunks;
+  std::vector chunks_;
 
-    // std::vector _limits;
+  /// The current and peak memory footprint of this pool. This is different from
+  /// total allocated_bytes_ since it includes bytes in chunks that are not used.
+  MemTracker* mem_tracker_;
 
-    // // true if one of the registered limits was exceeded during an allocate()
-    // // call
-    // bool _exceeded_limit;
+  /// Find or allocated a chunk with at least min_size spare capacity and update
+  /// current_chunk_idx_. Also updates chunks_, chunk_sizes_ and allocated_bytes_
+  /// if a new chunk needs to be created.
+  /// If check_limits is true, this call can fail (returns false) if adding a
+  /// new chunk exceeds the mem limits.
+  bool FindChunk(size_t min_size, bool check_limits);
 
-    // The current and peak memory footprint of this pool. This is different from
-    // total _allocated_bytes since it includes bytes in chunks that are not used.
-    MemTracker* _mem_tracker;
+  /// Check integrity of the supporting data structures; always returns true but DCHECKs
+  /// all invariants.
+  /// If 'check_current_chunk_empty' is true, checks that the current chunk contains no
+  /// data. Otherwise the current chunk can be either empty or full.
+  bool CheckIntegrity(bool check_current_chunk_empty);
 
-    // Find or allocated a chunk with at least min_size spare capacity and update
-    // _current_chunk_idx. Also updates _chunks, _chunk_sizes and _allocated_bytes
-    // if a new chunk needs to be created.
-    // If check_limits is true, this call can fail (returns false) if adding a
-    // new chunk exceeds the mem limits.
-    // bool find_chunk(int min_size);
-    bool find_chunk(int64_t min_size, bool check_limits);
+  /// Return offset to unoccupied space in current chunk.
+  int64_t GetFreeOffset() const {
+    if (current_chunk_idx_ == -1) return 0;
+    return chunks_[current_chunk_idx_].allocated_bytes;
+  }
 
-    // Check integrity of the supporting data structures; always returns true but DCHECKs
-    // all invariants.
-    // If 'current_chunk_empty' is false, checks that the current chunk contains data.
-    bool check_integrity(bool current_chunk_empty);
+  template 
+  uint8_t* ALWAYS_INLINE allocate(int64_t size, int alignment) {
+    DCHECK_GE(size, 0);
+    if (UNLIKELY(size == 0)) return reinterpret_cast(&zero_length_region_);
 
-    int get_offset_helper(uint8_t* data);
-    uint8_t* get_data_ptr_helper(int offset);
-
-    // Return offset to unoccpied space in current chunk.
-    int get_free_offset() const {
-        if (_current_chunk_idx == -1) {
-            return 0;
-        }
-        return _chunks[_current_chunk_idx].allocated_bytes;
-    }
-
-    template 
-    uint8_t* allocate(int size) {
-        if (size == 0) {
-            return (uint8_t*)&_s_zero_length_region;
-        }
-
-        // round up to nearest 8 bytes
-        // e.g. if size between 1 and 7, num_bytes will be 8
-        // int64_t num_bytes = (size + 8LL - 1) / 8 * 8;
-        int64_t num_bytes = BitUtil::round_up(size, 8);
-        if (_current_chunk_idx == -1
-                || (num_bytes + _chunks[_current_chunk_idx].allocated_bytes)
-                    > _chunks[_current_chunk_idx].size) {
-            // If we couldn't allocate a new chunk, return NULL.
-            if (UNLIKELY(!find_chunk(num_bytes, CHECK_LIMIT_FIRST))) {
-                return NULL;
-            }
-        }
-        ChunkInfo& info = _chunks[_current_chunk_idx];
-        uint8_t* result = info.data + info.allocated_bytes;
-        DCHECK_LE(info.allocated_bytes + num_bytes, info.size);
-        info.allocated_bytes += num_bytes;
-        _total_allocated_bytes += num_bytes;
-        DCHECK_LE(_current_chunk_idx, _chunks.size() - 1);
-        _peak_allocated_bytes = std::max(_total_allocated_bytes, _peak_allocated_bytes);
+    if (current_chunk_idx_ != -1) {
+      ChunkInfo& info = chunks_[current_chunk_idx_];
+      int64_t aligned_allocated_bytes = BitUtil::RoundUpToPowerOf2(
+          info.allocated_bytes, alignment);
+      if (aligned_allocated_bytes + size <= info.size) {
+        // Ensure the requested alignment is respected.
+        int64_t padding = aligned_allocated_bytes - info.allocated_bytes;
+        uint8_t* result = info.data + aligned_allocated_bytes;
+        ASAN_UNPOISON_MEMORY_REGION(result, size);
+        DCHECK_LE(info.allocated_bytes + size, info.size);
+        info.allocated_bytes += padding + size;
+        total_allocated_bytes_ += padding + size;
+        DCHECK_LE(current_chunk_idx_, chunks_.size() - 1);
         return result;
+      }
     }
+
+    // If we couldn't allocate a new chunk, return NULL. malloc() guarantees alignment
+    // of alignof(std::max_align_t), so we do not need to do anything additional to
+    // guarantee alignment.
+    //static_assert(
+        //INITIAL_CHUNK_SIZE >= config::FLAGS_MEMORY_MAX_ALIGNMENT, "Min chunk size too low");
+    if (UNLIKELY(!FindChunk(size, CHECK_LIMIT_FIRST))) return NULL;
+
+    ChunkInfo& info = chunks_[current_chunk_idx_];
+    uint8_t* result = info.data + info.allocated_bytes;
+    ASAN_UNPOISON_MEMORY_REGION(result, size);
+    DCHECK_LE(info.allocated_bytes + size, info.size);
+    info.allocated_bytes += size;
+    total_allocated_bytes_ += size;
+    DCHECK_LE(current_chunk_idx_, chunks_.size() - 1);
+    peak_allocated_bytes_ = std::max(total_allocated_bytes_, peak_allocated_bytes_);
+    return result;
+  }
 };
 
-} // end namespace palo
+// Stamp out templated implementations here so they're included in IR module
+template uint8_t* MemPool::allocate(int64_t size, int alignment);
+template uint8_t* MemPool::allocate(int64_t size, int alignment);
+}
 
 #endif
diff --git a/be/src/runtime/mem_tracker.cpp b/be/src/runtime/mem_tracker.cpp
index 8d71b5acf5..f8007dab0a 100644
--- a/be/src/runtime/mem_tracker.cpp
+++ b/be/src/runtime/mem_tracker.cpp
@@ -113,7 +113,11 @@ void MemTracker::Init() {
     DCHECK_EQ(_all_trackers[0], this);
 }
 
-void MemTracker::EnableReservationReporting(const ReservationTrackerCounters& counters) {
+// TODO chenhao , set MemTracker close state
+void MemTracker::close() {
+}
+
+void MemTracker::enable_reservation_reporting(const ReservationTrackerCounters& counters) {
     ReservationTrackerCounters* new_counters = new ReservationTrackerCounters(counters);
     _reservation_counters.store(new_counters);
 }
@@ -177,12 +181,15 @@ MemTracker* MemTracker::CreateQueryMemTracker(const TUniqueId& id,
     MemTracker* pool_tracker =
         ExecEnv::GetInstance()->pool_mem_trackers()->GetRequestPoolMemTracker(
                 pool_name, true);
+    return pool_tracker;
 }
 
 MemTracker::~MemTracker() {
     DCHECK_EQ(_consumption->current_value(), 0) << _label << "\n"
         << get_stack_trace() << "\n"
         << LogUsage("");
+    // TODO chenhao
+    //DCHECK(_closed) << _label;
     delete _reservation_counters.load();
 }
 
@@ -225,7 +232,12 @@ std::string MemTracker::LogUsage(const std::string& prefix, int64_t* logged_cons
         int64_t reservation = reservation_counters->peak_reservation->current_value();
         int64_t used_reservation =
             reservation_counters->peak_used_reservation->current_value();
-        int64_t reservation_limit = reservation_counters->reservation_limit->value();
+        int64_t reservation_limit = 0;
+        //TODO chenhao, reservation_limit is null when ReservationTracker 
+        // does't have reservation limit
+        if (reservation_counters->reservation_limit != nullptr) {
+             reservation_limit = reservation_counters->reservation_limit->value();
+        }
         ss << " BufferPoolUsed/Reservation="
             << PrettyPrinter::print(used_reservation, TUnit::BYTES) << "/"
             << PrettyPrinter::print(reservation, TUnit::BYTES);
diff --git a/be/src/runtime/mem_tracker.h b/be/src/runtime/mem_tracker.h
index f833cf9e2a..97061f8adc 100644
--- a/be/src/runtime/mem_tracker.h
+++ b/be/src/runtime/mem_tracker.h
@@ -28,9 +28,10 @@
 #include 
 
 #include "gen_cpp/Types_types.h"
-#include 
+#include "util/metrics.h"
 #include "util/runtime_profile.h"
 #include "util/spinlock.h"
+#include "common/status.h"
 
 namespace palo {
 
@@ -87,6 +88,13 @@ public:
 
     ~MemTracker();
 
+    /// Closes this MemTracker. After closing it is invalid to consume memory on this
+    /// tracker and the tracker's consumption counter (which may be owned by a
+    /// RuntimeProfile, not this MemTracker) can be safely destroyed. MemTrackers without
+    /// consumption metrics in the context of a daemon must always be closed.
+    /// Idempotent: calling multiple times has no effect.
+    void close();
+
     // Removes this tracker from _parent->_child_trackers.
     void unregister_from_parent() {
         DCHECK(_parent != NULL);
@@ -97,7 +105,7 @@ public:
 
     /// Include counters from a ReservationTracker in logs and other diagnostics.
     /// The counters should be owned by the fragment's RuntimeProfile.
-    void EnableReservationReporting(const ReservationTrackerCounters& counters);
+    void enable_reservation_reporting(const ReservationTrackerCounters& counters);
 
     /// Construct a MemTracker object for query 'id'. The query limits are determined based
     /// on 'query_options'. The MemTracker is a child of the request pool MemTracker for
@@ -130,7 +138,7 @@ public:
         }
         for (std::vector::iterator tracker = _all_trackers.begin();
              tracker != _all_trackers.end(); ++tracker) {
-            (*tracker)->_consumption->Add(bytes);
+            (*tracker)->_consumption->add(bytes);
             if ((*tracker)->_consumption_metric == NULL) {
                 DCHECK_GE((*tracker)->_consumption->current_value(), 0);
             }
@@ -147,7 +155,7 @@ public:
         for (int i = 0; i < _all_trackers.size(); ++i) {
             if (_all_trackers[i] == end_tracker) return;
             DCHECK(!_all_trackers[i]->has_limit());
-            _all_trackers[i]->_consumption->Add(bytes);
+            _all_trackers[i]->_consumption->add(bytes);
         }
         DCHECK(false) << "end_tracker is not an ancestor";
     }
@@ -170,13 +178,13 @@ public:
             MemTracker* tracker = _all_trackers[i];
             const int64_t limit = tracker->limit();
             if (limit < 0) {
-                tracker->_consumption->Add(bytes); // No limit at this tracker.
+                tracker->_consumption->add(bytes); // No limit at this tracker.
             } else {
                 // If TryConsume fails, we can try to GC, but we may need to try several times if
                 // there are concurrent consumers because we don't take a lock before trying to
                 // update _consumption.
                 while (true) {
-                    if (LIKELY(tracker->_consumption->TryAdd(bytes, limit))) break;
+                    if (LIKELY(tracker->_consumption->try_add(bytes, limit))) break;
 
                     VLOG_RPC << "TryConsume failed, bytes=" << bytes
                         << " consumption=" << tracker->_consumption->current_value()
@@ -185,7 +193,7 @@ public:
                         DCHECK_GE(i, 0);
                         // Failed for this mem tracker. Roll back the ones that succeeded.
                         for (int j = _all_trackers.size() - 1; j > i; --j) {
-                            _all_trackers[j]->_consumption->Add(-bytes);
+                            _all_trackers[j]->_consumption->add(-bytes);
                         }
                         return false;
                     }
@@ -213,7 +221,7 @@ public:
         }
         for (std::vector::iterator tracker = _all_trackers.begin();
              tracker != _all_trackers.end(); ++tracker) {
-            (*tracker)->_consumption->Add(-bytes);
+            (*tracker)->_consumption->add(-bytes);
             /// If a UDF calls FunctionContext::TrackAllocation() but allocates less than the
             /// reported amount, the subsequent call to FunctionContext::Free() may cause the
             /// process mem tracker to go negative until it is synced back to the tcmalloc
@@ -258,7 +266,7 @@ public:
     void RefreshConsumptionFromMetric() {
         DCHECK(_consumption_metric != nullptr);
         DCHECK(_parent == nullptr);
-        _consumption->Set(_consumption_metric->value());
+        _consumption->set(_consumption_metric->value());
     }
 
 
@@ -324,7 +332,7 @@ public:
 
     /// Register this MemTracker's metrics. Each key will be of the form
     /// ".".
-    void RegisterMetrics(MetricGroup* metrics, const std::string& prefix);
+    void RegisterMetrics(MetricRegistry* metrics, const std::string& prefix);
 
     /// Logs the usage of this tracker and all of its children (recursively).
     /// If 'logged_consumption' is non-NULL, sets the consumption value logged.
@@ -340,6 +348,8 @@ public:
     Status MemLimitExceeded(RuntimeState* state, const std::string& details,
             int64_t failed_allocation = 0);
 
+    static const int UNLIMITED_DEPTH = INT_MAX;
+
     static const std::string COUNTER_NAME;
 
     static void update_limits(int64_t bytes, std::vector* limits) {
@@ -373,6 +383,10 @@ public:
         return msg.str();
     }
 
+    bool is_consumption_metric_null() {
+        return _consumption_metric == nullptr;
+    }
+    
 private:
     friend class PoolMemTrackerRegistry;
 
diff --git a/be/src/runtime/merge_sorter.cpp b/be/src/runtime/merge_sorter.cpp
index 5bb7964046..b462430666 100644
--- a/be/src/runtime/merge_sorter.cpp
+++ b/be/src/runtime/merge_sorter.cpp
@@ -231,10 +231,10 @@ private:
         int64_t _index;
 
         // Pointer to the current tuple.
-        uint8_t* _current_tuple;
+        uint8_t* _current_tuple = nullptr;
 
         // Start of the buffer containing current tuple.
-        uint8_t* _buffer_start;
+        uint8_t* _buffer_start = nullptr;
 
         // Index into _run._fixed_len_blocks of the block containing the current tuple.
         int _block_index;
diff --git a/be/src/runtime/plan_fragment_executor.cpp b/be/src/runtime/plan_fragment_executor.cpp
index 654a9a25c3..c5d80fc1a3 100644
--- a/be/src/runtime/plan_fragment_executor.cpp
+++ b/be/src/runtime/plan_fragment_executor.cpp
@@ -33,6 +33,7 @@
 #include "exec/exchange_node.h"
 #include "exec/scan_node.h"
 #include "exprs/expr.h"
+#include "runtime/exec_env.h"
 #include "runtime/descriptors.h"
 #include "runtime/data_stream_mgr.h"
 #include "runtime/result_buffer_mgr.h"
@@ -42,6 +43,7 @@
 #include "util/debug_util.h"
 #include "util/container_util.hpp"
 #include "util/parse_util.h"
+#include "util/pretty_printer.h"
 #include "util/mem_info.h"
 
 namespace palo {
@@ -142,7 +144,7 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request) {
     // set up plan
     DCHECK(request.__isset.fragment);
     RETURN_IF_ERROR(
-            ExecNode::create_tree(obj_pool(), request.fragment.plan, *desc_tbl, &_plan));
+            ExecNode::create_tree(_runtime_state.get(), obj_pool(), request.fragment.plan, *desc_tbl, &_plan));
     _runtime_state->set_fragment_root_id(_plan->id());
 
     if (request.params.__isset.debug_node_id) {
@@ -163,8 +165,8 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request) {
         static_cast(exch_node)->set_num_senders(num_senders);
     }
 
+ 
     RETURN_IF_ERROR(_plan->prepare(_runtime_state.get()));
-
     // set scan ranges
     std::vector scan_nodes;
     std::vector no_scan_ranges;
@@ -335,11 +337,6 @@ Status PlanFragmentExecutor::open_internal() {
         Status status = _sink->close(runtime_state(), _status);
         RETURN_IF_ERROR(status);
     }
-    {
-        std::stringstream ss;
-        profile()->pretty_print(&ss);
-        LOG(INFO) << ss.str();
-    }
 
     // Setting to NULL ensures that the d'tor won't double-close the sink.
     _sink.reset(NULL);
@@ -500,8 +497,8 @@ void PlanFragmentExecutor::cancel() {
     LOG(INFO) << "cancel(): instance_id=" << _runtime_state->fragment_instance_id();
     DCHECK(_prepared);
     _runtime_state->set_is_cancelled(true);
-    _runtime_state->stream_mgr()->cancel(_runtime_state->fragment_instance_id());
-    _runtime_state->result_mgr()->cancel(_runtime_state->fragment_instance_id());
+    _runtime_state->exec_env()->stream_mgr()->cancel(_runtime_state->fragment_instance_id());
+    _runtime_state->exec_env()->result_mgr()->cancel(_runtime_state->fragment_instance_id());
 }
 
 const RowDescriptor& PlanFragmentExecutor::row_desc() {
@@ -529,16 +526,29 @@ void PlanFragmentExecutor::close() {
 
     // Prepare may not have been called, which sets _runtime_state
     if (_runtime_state.get() != NULL) {
-        _plan->close(_runtime_state.get());
+        
+        // _runtime_state init failed
+        if (_plan != nullptr) {
+            _plan->close(_runtime_state.get());
+        }
 
         if (_sink.get() != NULL) {
             _sink->close(runtime_state(), _status);
         }
 
         _exec_env->thread_mgr()->unregister_pool(_runtime_state->resource_pool());
-    }
 
-    _mem_tracker->release(_mem_tracker->consumption());
+        {
+            std::stringstream ss;
+            profile()->pretty_print(&ss);
+            LOG(INFO) << ss.str();
+        }
+    }
+     
+    // _mem_tracker init failed
+    if (_mem_tracker.get() != nullptr) {
+        _mem_tracker->release(_mem_tracker->consumption());
+    }
     _closed = true;
 }
 
diff --git a/be/src/runtime/pull_load_task_mgr.cpp b/be/src/runtime/pull_load_task_mgr.cpp
index c2d82275e6..77e1d6fffb 100644
--- a/be/src/runtime/pull_load_task_mgr.cpp
+++ b/be/src/runtime/pull_load_task_mgr.cpp
@@ -92,10 +92,6 @@ Status PullLoadTaskCtx::add_sub_task_info(
         _task_info.file_map.emplace(it.first, it.second);
     }
 
-    // TODO(zc): counters
-    for (auto& it : sub_task_info.counters) {
-    }
-
     if (sub_task_info.__isset.tracking_url) {
         _task_info.tracking_urls.push_back(sub_task_info.tracking_url);
     }
diff --git a/be/src/runtime/raw_value.cpp b/be/src/runtime/raw_value.cpp
index 3c5396e6e1..c06b7487f2 100644
--- a/be/src/runtime/raw_value.cpp
+++ b/be/src/runtime/raw_value.cpp
@@ -25,6 +25,7 @@
 #include "runtime/string_value.hpp"
 #include "runtime/tuple.h"
 #include "olap/utils.h"
+#include "util/types.h"
 
 namespace palo {
 
@@ -164,7 +165,7 @@ void RawValue::print_value(const void* value, const TypeDescriptor& type, int sc
         break;
 
     case TYPE_LARGEINT:
-        *stream << *reinterpret_cast(value);
+        *stream << reinterpret_cast(value)->value;
         break;
 
     default:
@@ -247,7 +248,7 @@ void RawValue::write(const void* value, void* dst, const TypeDescriptor& type, M
     }
 
     case TYPE_LARGEINT: {
-        *reinterpret_cast<__int128*>(dst) = *reinterpret_cast(value);
+        *reinterpret_cast(dst) = *reinterpret_cast(value);
         break;
     }
 
@@ -314,7 +315,7 @@ void RawValue::write(const void* value, const TypeDescriptor& type, void* dst, u
             *reinterpret_cast(dst) = *reinterpret_cast(value);
             break;
         case TYPE_LARGEINT:
-            *reinterpret_cast<__int128*>(dst) = *reinterpret_cast(value);
+            *reinterpret_cast(dst) = *reinterpret_cast(value);
             break;
         case TYPE_FLOAT:
             *reinterpret_cast(dst) = *reinterpret_cast(value);
diff --git a/be/src/runtime/raw_value.h b/be/src/runtime/raw_value.h
index 7701fd1959..8062343a24 100644
--- a/be/src/runtime/raw_value.h
+++ b/be/src/runtime/raw_value.h
@@ -29,6 +29,7 @@
 #include "runtime/types.h"
 #include "runtime/string_value.h"
 #include "util/hash_util.hpp"
+#include "util/types.h"
 
 namespace palo {
 
@@ -170,8 +171,8 @@ inline bool RawValue::lt(const void* v1, const void* v2, const TypeDescriptor& t
                *reinterpret_cast(v2);
 
     case TYPE_LARGEINT:
-        return *reinterpret_cast(v1) <
-               *reinterpret_cast(v2);
+        return reinterpret_cast(v1)->value <
+               reinterpret_cast(v2)->value;
 
     default:
         DCHECK(false) << "invalid type: " << type;
@@ -228,8 +229,8 @@ inline bool RawValue::eq(const void* v1, const void* v2, const TypeDescriptor& t
                *reinterpret_cast(v2);
 
     case TYPE_LARGEINT:
-        return *reinterpret_cast(v1) ==
-               *reinterpret_cast(v2);
+        return reinterpret_cast(v1)->value ==
+               reinterpret_cast(v2)->value;
 
     default:
         DCHECK(false) << "invalid type: " << type;
@@ -282,7 +283,7 @@ inline uint32_t RawValue::get_hash_value(
 
     case TYPE_DATE:
     case TYPE_DATETIME:
-        return HashUtil::hash(v, 12, seed);
+        return HashUtil::hash(v, 16, seed);
 
     case TYPE_DECIMAL:
         return HashUtil::hash(v, 40, seed);
@@ -337,7 +338,7 @@ inline uint32_t RawValue::get_hash_value_fvn(
 
     case TYPE_DATE:
     case TYPE_DATETIME:
-        return HashUtil::fnv_hash(v, 12, seed);
+        return HashUtil::fnv_hash(v, 16, seed);
 
     case TYPE_DECIMAL:
         return ((DecimalValue *) v)->hash(seed);
diff --git a/be/src/runtime/raw_value_ir.cpp b/be/src/runtime/raw_value_ir.cpp
index 8135657902..07b1ff28be 100644
--- a/be/src/runtime/raw_value_ir.cpp
+++ b/be/src/runtime/raw_value_ir.cpp
@@ -19,6 +19,7 @@
 // under the License.
 
 #include "runtime/raw_value.h"
+#include "util/types.h"
 
 namespace palo {
 
@@ -29,8 +30,6 @@ int RawValue::compare(const void* v1, const void* v2, const TypeDescriptor& type
     const DateTimeValue* ts_value2;
     const DecimalValue* decimal_value1;
     const DecimalValue* decimal_value2;
-    const __int128* large_int_value1;
-    const __int128* large_int_value2;
     float f1 = 0;
     float f2 = 0;
     double d1 = 0;
@@ -103,11 +102,12 @@ int RawValue::compare(const void* v1, const void* v2, const TypeDescriptor& type
         return (*decimal_value1 > *decimal_value2)
                 ? 1 : (*decimal_value1 < *decimal_value2 ? -1 : 0);
 
-    case TYPE_LARGEINT:
-        large_int_value1 = reinterpret_cast(v1);
-        large_int_value2 = reinterpret_cast(v2);
-        return *large_int_value1 > *large_int_value2 ? 1 : 
-                (*large_int_value1 < *large_int_value2 ? -1 : 0);
+    case TYPE_LARGEINT: {
+        __int128 large_int_value1 = reinterpret_cast(v1)->value;
+        __int128 large_int_value2 = reinterpret_cast(v2)->value;
+        return large_int_value1 > large_int_value2 ? 1 : 
+                (large_int_value1 < large_int_value2 ? -1 : 0);
+    }
 
     default:
         DCHECK(false) << "invalid type: " << type.type;
diff --git a/be/src/runtime/result_writer.cpp b/be/src/runtime/result_writer.cpp
index bbc7a99173..2cb8712aa0 100644
--- a/be/src/runtime/result_writer.cpp
+++ b/be/src/runtime/result_writer.cpp
@@ -27,6 +27,7 @@
 #include "runtime/result_buffer_mgr.h"
 #include "runtime/buffer_control_block.h"
 #include "util/mysql_row_buffer.h"
+#include "util/types.h"
 
 #include "gen_cpp/PaloInternalService_types.h"
 
@@ -90,10 +91,10 @@ Status ResultWriter::add_one_row(TupleRow* row) {
             break;
 
         case TYPE_LARGEINT: {
-            const __int128* large_int_val = reinterpret_cast(item);
             char buf[48];
             int len = 48;
-            char* v = LargeIntValue::to_string(*large_int_val, buf, &len);
+            char* v = LargeIntValue::to_string(
+                reinterpret_cast(item)->value, buf, &len);
             buf_ret = _row_buffer->push_string(v, len);
             break;
         }
diff --git a/be/src/runtime/row_batch.cpp b/be/src/runtime/row_batch.cpp
index 9dc7a25b35..39b878cd87 100644
--- a/be/src/runtime/row_batch.cpp
+++ b/be/src/runtime/row_batch.cpp
@@ -23,11 +23,14 @@
 #include   // for intptr_t
 #include 
 
+#include "runtime/exec_env.h"
 #include "runtime/runtime_state.h"
 #include "runtime/string_value.h"
 #include "runtime/tuple_row.h"
 #include "runtime/buffered_tuple_stream2.inline.h"
+//#include "runtime/mem_tracker.h"
 #include "gen_cpp/Data_types.h"
+#include "gen_cpp/data.pb.h"
 
 using std::vector;
 
@@ -53,7 +56,7 @@ RowBatch::RowBatch(const RowDescriptor& row_desc, int capacity, MemTracker* mem_
     _tuple_ptrs_size = _capacity * _num_tuples_per_row * sizeof(Tuple*);
     DCHECK_GT(_tuple_ptrs_size, 0);
     // TODO: switch to Init() pattern so we can check memory limit and return Status.
-    if (config::enable_partitioned_aggregation) {
+    if (config::enable_partitioned_aggregation || config::enable_new_partitioned_aggregation) {
         _mem_tracker->consume(_tuple_ptrs_size);
         _tuple_ptrs = reinterpret_cast(malloc(_tuple_ptrs_size));
         DCHECK(_tuple_ptrs != NULL);
@@ -62,6 +65,102 @@ RowBatch::RowBatch(const RowDescriptor& row_desc, int capacity, MemTracker* mem_
     }
 }
 
+// TODO: we want our input_batch's tuple_data to come from our (not yet implemented)
+// global runtime memory segment; how do we get thrift to allocate it from there?
+// maybe change line (in Data_types.cc generated from Data.thrift)
+//              xfer += iprot->readString(this->tuple_data[_i9]);
+// to allocated string data in special mempool
+// (change via python script that runs over Data_types.cc)
+RowBatch::RowBatch(const RowDescriptor& row_desc,
+                   const PRowBatch& input_batch,
+                   MemTracker* tracker)
+            : _mem_tracker(tracker),
+            _has_in_flight_row(false),
+            _num_rows(input_batch.num_rows()),
+            _capacity(_num_rows),
+            _flush(FlushMode::NO_FLUSH_RESOURCES),
+            _needs_deep_copy(false),
+            _num_tuples_per_row(input_batch.row_tuples_size()),
+            _row_desc(row_desc),
+            _auxiliary_mem_usage(0),
+            _need_to_return(false),
+            _tuple_data_pool(new MemPool(_mem_tracker)) {
+    DCHECK(_mem_tracker != nullptr);
+    _tuple_ptrs_size = _num_rows * _num_tuples_per_row * sizeof(Tuple*);
+    DCHECK_GT(_tuple_ptrs_size, 0);
+    // TODO: switch to Init() pattern so we can check memory limit and return Status.
+    if (config::enable_partitioned_aggregation || config::enable_new_partitioned_aggregation) {
+        _mem_tracker->consume(_tuple_ptrs_size);
+        _tuple_ptrs = reinterpret_cast(malloc(_tuple_ptrs_size));
+        DCHECK(_tuple_ptrs != nullptr);
+    } else {
+        _tuple_ptrs = reinterpret_cast(_tuple_data_pool->allocate(_tuple_ptrs_size));
+    }
+
+    uint8_t* tuple_data = nullptr;
+    if (input_batch.is_compressed()) {
+        // Decompress tuple data into data pool
+        const char* compressed_data = input_batch.tuple_data().c_str();
+        size_t compressed_size = input_batch.tuple_data().size();
+        size_t uncompressed_size = 0;
+        bool success = snappy::GetUncompressedLength(compressed_data, compressed_size,
+                       &uncompressed_size);
+        DCHECK(success) << "snappy::GetUncompressedLength failed";
+        tuple_data = reinterpret_cast(_tuple_data_pool->allocate(uncompressed_size));
+        success = snappy::RawUncompress(
+                compressed_data, compressed_size, reinterpret_cast(tuple_data));
+        DCHECK(success) << "snappy::RawUncompress failed";
+    } else {
+        // Tuple data uncompressed, copy directly into data pool
+        tuple_data = _tuple_data_pool->allocate(input_batch.tuple_data().size());
+        memcpy(tuple_data, input_batch.tuple_data().c_str(), input_batch.tuple_data().size());
+    }
+
+    // convert input_batch.tuple_offsets into pointers
+    int tuple_idx = 0;
+    for (auto offset : input_batch.tuple_offsets()) {
+        if (offset == -1) {
+            _tuple_ptrs[tuple_idx++] = nullptr;
+        } else {
+            _tuple_ptrs[tuple_idx++] = reinterpret_cast(tuple_data + offset);
+        }
+    }
+
+    // Check whether we have slots that require offset-to-pointer conversion.
+    if (!_row_desc.has_varlen_slots()) {
+        return;
+    }
+    const vector& tuple_descs = _row_desc.tuple_descriptors();
+
+    // For every unique tuple, convert string offsets contained in tuple data into
+    // pointers. Tuples were serialized in the order we are deserializing them in,
+    // so the first occurrence of a tuple will always have a higher offset than any tuple
+    // we already converted.
+    for (int i = 0; i < _num_rows; ++i) {
+        TupleRow* row = get_row(i);
+        vector::const_iterator desc = tuple_descs.begin();
+        for (int j = 0; desc != tuple_descs.end(); ++desc, ++j) {
+            if ((*desc)->string_slots().empty()) {
+                continue;
+            }
+
+            Tuple* tuple = row->get_tuple(j);
+            if (tuple == NULL) {
+                continue;
+            }
+
+            vector::const_iterator slot = (*desc)->string_slots().begin();
+            for (; slot != (*desc)->string_slots().end(); ++slot) {
+                DCHECK((*slot)->type().is_string_type());
+                StringValue* string_val = tuple->get_string_slot((*slot)->tuple_offset());
+
+                int offset = reinterpret_cast(string_val->ptr);
+                string_val->ptr = reinterpret_cast(tuple_data + offset);
+            }
+        }
+    }
+}
+
 // TODO: we want our input_batch's tuple_data to come from our (not yet implemented)
 // global runtime memory segment; how do we get thrift to allocate it from there?
 // maybe change line (in Data_types.cc generated from Data.thrift)
@@ -84,7 +183,7 @@ RowBatch::RowBatch(const RowDescriptor& row_desc, const TRowBatch& input_batch,
     _tuple_ptrs_size = _num_rows * input_batch.row_tuples.size() * sizeof(Tuple*);
     DCHECK_GT(_tuple_ptrs_size, 0);
     // TODO: switch to Init() pattern so we can check memory limit and return Status.
-    if (config::enable_partitioned_aggregation) {
+    if (config::enable_partitioned_aggregation || config::enable_new_partitioned_aggregation) {
         _mem_tracker->consume(_tuple_ptrs_size);
         _tuple_ptrs = reinterpret_cast(malloc(_tuple_ptrs_size));
         DCHECK(_tuple_ptrs != NULL);
@@ -128,14 +227,12 @@ RowBatch::RowBatch(const RowDescriptor& row_desc, const TRowBatch& input_batch,
     if (!_row_desc.has_varlen_slots()) {
         return;
     }
-    bool has_string_slots = _row_desc.has_varlen_slots();
     const vector& tuple_descs = _row_desc.tuple_descriptors();
 
     // For every unique tuple, convert string offsets contained in tuple data into
     // pointers. Tuples were serialized in the order we are deserializing them in,
     // so the first occurrence of a tuple will always have a higher offset than any tuple
     // we already converted.
-    Tuple* last_converted = NULL;
     for (int i = 0; i < _num_rows; ++i) {
         TupleRow* row = get_row(i);
         vector::const_iterator desc = tuple_descs.begin();
@@ -145,11 +242,6 @@ RowBatch::RowBatch(const RowDescriptor& row_desc, const TRowBatch& input_batch,
             }
 
             Tuple* tuple = row->get_tuple(j);
-            // Handle NULL or already converted tuples with one check.
-            // if (tuple <= last_converted) {
-            //     continue;
-            // }
-            last_converted = tuple;
             if (tuple == NULL) {
                 continue;
             }
@@ -175,11 +267,17 @@ void RowBatch::clear() {
     for (int i = 0; i < _io_buffers.size(); ++i) {
         _io_buffers[i]->return_buffer();
     }
+  
+    for (BufferInfo& buffer_info : _buffers) {
+        ExecEnv::GetInstance()->buffer_pool()->FreeBuffer(
+          buffer_info.client, &buffer_info.buffer);
+    }
+
     close_tuple_streams();
     for (int i = 0; i < _blocks.size(); ++i) {
         _blocks[i]->del();
     }
-    if (config::enable_partitioned_aggregation) {
+    if (config::enable_partitioned_aggregation || config::enable_new_partitioned_aggregation) {
         DCHECK(_tuple_ptrs != NULL);
         free(_tuple_ptrs);
         _mem_tracker->release(_tuple_ptrs_size);
@@ -259,6 +357,74 @@ int RowBatch::serialize(TRowBatch* output_batch) {
     return get_batch_size(*output_batch) - output_batch->tuple_data.size() + size;
 }
 
+int RowBatch::serialize(PRowBatch* output_batch) {
+    // num_rows
+    output_batch->set_num_rows(_num_rows);
+    // row_tuples
+    _row_desc.to_protobuf(output_batch->mutable_row_tuples());
+    // tuple_offsets: must clear before reserve
+    output_batch->clear_tuple_offsets();
+    output_batch->mutable_tuple_offsets()->Reserve(_num_rows * _num_tuples_per_row);
+    // is_compressed
+    output_batch->set_is_compressed(false);
+    // tuple data
+    int size = total_byte_size();
+    auto mutable_tuple_data = output_batch->mutable_tuple_data();
+    mutable_tuple_data->resize(size);
+
+    // Copy tuple data, including strings, into output_batch (converting string
+    // pointers into offsets in the process)
+    int offset = 0; // current offset into output_batch->tuple_data
+    char* tuple_data = const_cast(mutable_tuple_data->data());
+    for (int i = 0; i < _num_rows; ++i) {
+        TupleRow* row = get_row(i);
+        const vector& tuple_descs = _row_desc.tuple_descriptors();
+        vector::const_iterator desc = tuple_descs.begin();
+
+        for (int j = 0; desc != tuple_descs.end(); ++desc, ++j) {
+            if (row->get_tuple(j) == NULL) {
+                // NULLs are encoded as -1
+                output_batch->mutable_tuple_offsets()->Add(-1);
+                continue;
+            }
+
+            // Record offset before creating copy (which increments offset and tuple_data)
+            output_batch->mutable_tuple_offsets()->Add(offset);
+            row->get_tuple(j)->deep_copy(**desc, &tuple_data, &offset, /* convert_ptrs */ true);
+            DCHECK_LE(offset, size);
+        }
+    }
+
+    DCHECK_EQ(offset, size);
+
+    if (config::compress_rowbatches && size > 0) {
+        // Try compressing tuple_data to _compression_scratch, swap if compressed data is
+        // smaller
+        int max_compressed_size = snappy::MaxCompressedLength(size);
+
+        if (_compression_scratch.size() < max_compressed_size) {
+            _compression_scratch.resize(max_compressed_size);
+        }
+
+        size_t compressed_size = 0;
+        char* compressed_output = const_cast(_compression_scratch.c_str());
+        snappy::RawCompress(mutable_tuple_data->data(), size,
+                            compressed_output, &compressed_size);
+
+        if (LIKELY(compressed_size < size)) {
+            _compression_scratch.resize(compressed_size);
+            mutable_tuple_data->swap(_compression_scratch);
+            output_batch->set_is_compressed(true);
+        }
+
+        VLOG_ROW << "uncompressed size: " << size << ", compressed size: " << compressed_size;
+    }
+
+    // The size output_batch would be if we didn't compress tuple_data (will be equal to
+    // actual batch size if tuple_data isn't compressed)
+    return get_batch_size(*output_batch) - mutable_tuple_data->size() + size;
+}
+
 void RowBatch::add_io_buffer(DiskIoMgr::BufferDescriptor* buffer) {
     DCHECK(buffer != NULL);
     _io_buffers.push_back(buffer);
@@ -302,12 +468,19 @@ void RowBatch::reset() {
     _num_rows = 0;
     _capacity = _tuple_ptrs_size / (_num_tuples_per_row * sizeof(Tuple*));
     _has_in_flight_row = false;
+    
     // TODO: Change this to Clear() and investigate the repercussions.
     _tuple_data_pool->free_all();
     for (int i = 0; i < _io_buffers.size(); ++i) {
         _io_buffers[i]->return_buffer();
     }
     _io_buffers.clear();
+    
+    for (BufferInfo& buffer_info : _buffers) {
+      ExecEnv::GetInstance()->buffer_pool()->FreeBuffer(
+          buffer_info.client, &buffer_info.buffer);
+    }
+    _buffers.clear();
 
     close_tuple_streams();
     for (int i = 0; i < _blocks.size(); ++i) {
@@ -315,7 +488,7 @@ void RowBatch::reset() {
     }
     _blocks.clear();
     _auxiliary_mem_usage = 0;
-    if (!config::enable_partitioned_aggregation) {
+    if (!config::enable_partitioned_aggregation && !config::enable_new_partitioned_aggregation) {
         _tuple_ptrs = reinterpret_cast(_tuple_data_pool->allocate(_tuple_ptrs_size));
     }
     _need_to_return = false;
@@ -341,6 +514,13 @@ void RowBatch::transfer_resource_ownership(RowBatch* dest) {
         buffer->set_mem_tracker(dest->_mem_tracker);
     }
     _io_buffers.clear();
+
+    for (BufferInfo& buffer_info : _buffers) {
+        dest->add_buffer(
+            buffer_info.client, std::move(buffer_info.buffer), FlushMode::NO_FLUSH_RESOURCES);
+    }
+    _buffers.clear();
+
     for (int i = 0; i < _tuple_streams.size(); ++i) {
         dest->_tuple_streams.push_back(_tuple_streams[i]);
         dest->_auxiliary_mem_usage += _tuple_streams[i]->byte_size();
@@ -353,15 +533,15 @@ void RowBatch::transfer_resource_ownership(RowBatch* dest) {
     _blocks.clear();
     dest->_need_to_return |= _need_to_return;
     _auxiliary_mem_usage = 0;
-    if (!config::enable_partitioned_aggregation) {
+    if (!config::enable_partitioned_aggregation && !config::enable_new_partitioned_aggregation) {
         _tuple_ptrs = NULL;
     }
 
     if (_needs_deep_copy) {
-        dest->mark_needs_deep_copy();
+      dest->mark_needs_deep_copy();
     } else if (_flush == FlushMode::FLUSH_RESOURCES) {
-        dest->mark_flush_resources();
-    }
+      dest->mark_flush_resources();
+    } 
     reset();
 }
 
@@ -372,6 +552,13 @@ int RowBatch::get_batch_size(const TRowBatch& batch) {
     return result;
 }
 
+int RowBatch::get_batch_size(const PRowBatch& batch) {
+    int result = batch.tuple_data().size();
+    result += batch.row_tuples().size() * sizeof(int32_t);
+    result += batch.tuple_offsets().size() * sizeof(int32_t);
+    return result;
+}
+
 void RowBatch::acquire_state(RowBatch* src) {
     // DCHECK(_row_desc.equals(src->_row_desc));
     DCHECK_EQ(_num_tuples_per_row, src->_num_tuples_per_row);
@@ -398,7 +585,7 @@ void RowBatch::acquire_state(RowBatch* src) {
     _num_rows = src->_num_rows;
     _capacity = src->_capacity;
     _need_to_return = src->_need_to_return;
-    if (!config::enable_partitioned_aggregation) {
+    if (!config::enable_partitioned_aggregation && !config::enable_new_partitioned_aggregation) {
         // Tuple pointers are allocated from tuple_data_pool_ so are transferred.
         _tuple_ptrs = src->_tuple_ptrs;
         src->_tuple_ptrs = NULL;
@@ -421,7 +608,7 @@ void RowBatch::swap(RowBatch* other) {
     std::swap(_has_in_flight_row, other->_has_in_flight_row);
     std::swap(_num_rows, other->_num_rows);
     std::swap(_capacity, other->_capacity);
-    if (!config::enable_partitioned_aggregation) {
+    if (!config::enable_partitioned_aggregation && !config::enable_new_partitioned_aggregation) {
         // Tuple pointers are allocated from tuple_data_pool_ so are transferred.
         _tuple_ptrs = other->_tuple_ptrs;
         other->_tuple_ptrs = NULL;
@@ -476,4 +663,13 @@ int RowBatch::max_tuple_buffer_size() {
     return tuple_buffer_size;
 }
 
+void RowBatch::add_buffer(BufferPool::ClientHandle* client,
+      BufferPool::BufferHandle&& buffer, FlushMode flush) {
+    _auxiliary_mem_usage += buffer.len();
+    BufferInfo buffer_info;
+    buffer_info.client = client;
+    buffer_info.buffer = std::move(buffer);
+    _buffers.push_back(std::move(buffer_info));
+    if (flush == FlushMode::FLUSH_RESOURCES) mark_flush_resources();
+}
 } // end namespace palo
diff --git a/be/src/runtime/row_batch.h b/be/src/runtime/row_batch.h
index 3ed9cea8a5..23d2fd4f4c 100644
--- a/be/src/runtime/row_batch.h
+++ b/be/src/runtime/row_batch.h
@@ -29,6 +29,7 @@
 #include "codegen/palo_ir.h"
 #include "runtime/buffered_block_mgr2.h" // for BufferedBlockMgr2::Block
 // #include "runtime/buffered_tuple_stream2.inline.h"
+#include "runtime/bufferpool/buffer_pool.h"
 #include "runtime/disk_io_mgr.h"
 #include "runtime/descriptors.h"
 #include "runtime/mem_pool.h"
@@ -41,6 +42,7 @@ class TRowBatch;
 class Tuple;
 class TupleRow;
 class TupleDescriptor;
+class PRowBatch;
 
 // A RowBatch encapsulates a batch of rows, each composed of a number of tuples.
 // The maximum number of rows is fixed at the time of construction, and the caller
@@ -93,6 +95,8 @@ public:
     // (so that we don't need to make yet another copy)
     RowBatch(const RowDescriptor& row_desc, const TRowBatch& input_batch, MemTracker* tracker);
 
+    RowBatch(const RowDescriptor& row_desc, const PRowBatch& input_batch, MemTracker* tracker);
+
     // Releases all resources accumulated at this row batch.  This includes
     //  - tuple_ptrs
     //  - tuple mem pool data
@@ -261,6 +265,16 @@ public:
     // and will call Close() on the stream and delete it when freeing resources.
     void add_tuple_stream(BufferedTupleStream2* stream);
 
+    /// Adds a buffer to this row batch. The buffer is deleted when freeing resources.
+    /// The buffer's memory remains accounted against the original owner, even when the
+    /// ownership of batches is transferred. If the original owner wants the memory to be
+    /// released, it should call this with 'mode' FLUSH_RESOURCES (see MarkFlushResources()
+    /// for further explanation).
+    /// TODO: IMPALA-4179: after IMPALA-3200, simplify the ownership transfer model and
+    /// make it consistent between buffers and I/O buffers.
+    void add_buffer(BufferPool::ClientHandle* client, BufferPool::BufferHandle&& buffer,
+        FlushMode flush);
+
     // Adds a block to this row batch. The block must be pinned. The blocks must be
     // deleted when freeing resources.
     void add_block(BufferedBlockMgr2::Block* block);
@@ -357,9 +371,11 @@ public:
     // Returns the uncompressed serialized size (this will be the true size of output_batch
     // if tuple_data is actually uncompressed).
     int serialize(TRowBatch* output_batch);
+    int serialize(PRowBatch* output_batch);
 
     // Utility function: returns total size of batch.
     static int get_batch_size(const TRowBatch& batch);
+    static int get_batch_size(const PRowBatch& batch);
 
     int num_rows() const {
         return _num_rows;
@@ -368,6 +384,9 @@ public:
         return _capacity;
     }
 
+    int num_buffers() const { 
+        return _buffers.size(); 
+    }
     // Swaps all of the row batch state with 'other'.  This is used for scan nodes
     // which produce RowBatches asynchronously.  Typically, an ExecNode is handed
     // a row batch to populate (pull model) but ScanNodes have multiple threads
@@ -472,6 +491,12 @@ private:
     // (i.e. they are not ref counted) so most row batches don't own any.
     std::vector _io_buffers;
 
+    struct BufferInfo {
+        BufferPool::ClientHandle* client;
+        BufferPool::BufferHandle buffer;
+    };
+    /// Pages attached to this row batch. See AddBuffer() for ownership semantics.
+    std::vector _buffers;
     // Tuple streams currently owned by this row batch.
     std::vector _tuple_streams;
 
diff --git a/be/src/runtime/runtime_state.cpp b/be/src/runtime/runtime_state.cpp
index 24e0b673ab..d18d46ffdc 100644
--- a/be/src/runtime/runtime_state.cpp
+++ b/be/src/runtime/runtime_state.cpp
@@ -31,7 +31,10 @@
 #include "exprs/expr.h"
 #include "runtime/buffered_block_mgr.h"
 #include "runtime/buffered_block_mgr2.h"
+#include "runtime/bufferpool/reservation_util.h"
 #include "runtime/descriptors.h"
+#include "runtime/exec_env.h"
+#include "runtime/initial_reservations.h"
 #include "runtime/runtime_state.h"
 #include "runtime/load_path_mgr.h"
 #include "util/cpu_info.h"
@@ -39,7 +42,10 @@
 #include "util/debug_util.h"
 #include "util/disk_info.h"
 #include "util/file_utils.h"
+#include "util/pretty_printer.h"
 #include "util/mysql_load_error_hub.h"
+#include "runtime/mem_tracker.h"
+#include "runtime/bufferpool/reservation_tracker.h"
 
 namespace palo {
 
@@ -59,7 +65,8 @@ RuntimeState::RuntimeState(
             _num_rows_load_filtered(0),
             _normal_row_number(0),
             _error_row_number(0),
-            _error_log_file(nullptr) {
+            _error_log_file(nullptr),
+            _instance_buffer_reservation(new ReservationTracker) {
     Status status = init(fragment_instance_id, query_options, now, exec_env);
     DCHECK(status.ok());
 }
@@ -82,7 +89,8 @@ RuntimeState::RuntimeState(
             _num_rows_load_filtered(0),
             _normal_row_number(0),
             _error_row_number(0),
-            _error_log_file(nullptr) {
+            _error_log_file(nullptr),
+            _instance_buffer_reservation(new ReservationTracker) {
     Status status = init(fragment_params.params.fragment_instance_id, query_options, now, exec_env);
     DCHECK(status.ok());
 }
@@ -112,6 +120,19 @@ RuntimeState::~RuntimeState() {
         _error_hub->close();
     }
 
+    // Release the reservation, which should be unused at this point.
+    if (_instance_buffer_reservation != nullptr) {
+        _instance_buffer_reservation->Close();
+    }
+
+    if (_initial_reservations != nullptr) { 
+        _initial_reservations->ReleaseResources();
+    }
+    
+    if (_buffer_reservation != nullptr) {
+        _buffer_reservation->Close();
+    }
+
     // _query_mem_tracker must be valid as long as _instance_mem_tracker is so
     // delete _instance_mem_tracker first.
     // LogUsage() walks the MemTracker tree top-down when the memory limit is exceeded.
@@ -122,7 +143,13 @@ RuntimeState::~RuntimeState() {
         _instance_mem_tracker->unregister_from_parent();
     }
 
+    _instance_mem_tracker->close();
     _instance_mem_tracker.reset();
+   
+    if (_query_mem_tracker.get() != NULL) {
+        _query_mem_tracker->unregister_from_parent();
+    }
+    _query_mem_tracker->close();
     _query_mem_tracker.reset();
 }
 
@@ -165,7 +192,6 @@ Status RuntimeState::init(
 Status RuntimeState::init_mem_trackers(const TUniqueId& query_id) {
     bool has_query_mem_tracker = _query_options.__isset.mem_limit && (_query_options.mem_limit > 0);
     int64_t bytes_limit = has_query_mem_tracker ? _query_options.mem_limit : -1;
-
     // we do not use global query-map  for now, to avoid mem-exceeded different fragments
     // running on the same machine.
     // TODO(lingbin): open it later. note that open with BufferedBlcokMgr's BlockMgrsMap
@@ -184,10 +210,51 @@ Status RuntimeState::init_mem_trackers(const TUniqueId& query_id) {
         new MemTracker(-1, "UDFs", _instance_mem_tracker.get()));
     _udf_pool.reset(new MemPool(_udf_mem_tracker.get()));
     */
-    _udf_pool.reset(new MemPool(_instance_mem_tracker.get()));
+    // _udf_pool.reset(new MemPool(_instance_mem_tracker.get()));
+
+    RETURN_IF_ERROR(init_buffer_poolstate());
+
+    _initial_reservations = _obj_pool->add(new InitialReservations(_obj_pool.get(),
+                      _buffer_reservation, _query_mem_tracker.get(), 
+                      _query_options.initial_reservation_total_claims));
+    RETURN_IF_ERROR(
+        _initial_reservations->Init(_query_id, min_reservation()));
+    DCHECK_EQ(0, _initial_reservation_refcnt.load());
+
+    if (_instance_buffer_reservation != nullptr) {
+        _instance_buffer_reservation->InitChildTracker(&_profile,
+            _buffer_reservation, _instance_mem_tracker.get(),
+            std::numeric_limits::max());
+    } 
+
     return Status::OK;
 }
 
+Status RuntimeState::init_buffer_poolstate() {
+  ExecEnv* exec_env = ExecEnv::GetInstance();
+  int64_t mem_limit = _query_mem_tracker->lowest_limit();
+  int64_t max_reservation;
+  if (query_options().__isset.buffer_pool_limit
+      && query_options().buffer_pool_limit > 0) {
+    max_reservation = query_options().buffer_pool_limit;
+  } else if (mem_limit == -1) {
+    // No query mem limit. The process-wide reservation limit is the only limit on
+    // reservations.
+    max_reservation = std::numeric_limits::max();
+  } else {
+    DCHECK_GE(mem_limit, 0);
+    max_reservation = ReservationUtil::GetReservationLimitFromMemLimit(mem_limit);
+  }
+
+  VLOG_QUERY << "Buffer pool limit for " << print_id(_query_id) << ": " << max_reservation;
+
+  _buffer_reservation = _obj_pool->add(new ReservationTracker);
+  _buffer_reservation->InitChildTracker(
+      NULL, exec_env->buffer_reservation(), _query_mem_tracker.get(), max_reservation);
+  
+  return Status::OK;
+}
+
 Status RuntimeState::create_block_mgr() {
     DCHECK(_block_mgr.get() == NULL);
     DCHECK(_block_mgr2.get() == NULL);
@@ -200,7 +267,7 @@ Status RuntimeState::create_block_mgr() {
     }
     RETURN_IF_ERROR(BufferedBlockMgr2::create(this, _query_mem_tracker.get(),
             runtime_profile(), _exec_env->tmp_file_mgr(),
-            block_mgr_limit, io_mgr()->max_read_buffer_size(), &_block_mgr2));
+            block_mgr_limit, _exec_env->disk_io_mgr()->max_read_buffer_size(), &_block_mgr2));
     return Status::OK;
 }
 
@@ -412,5 +479,10 @@ Status RuntimeState::get_codegen(LlvmCodeGen** codegen) {
     return get_codegen(codegen, true);
 }
 
+// TODO chenhao , check scratch_limit, disable_spilling and file_group
+// before spilling
+Status RuntimeState::StartSpilling(MemTracker* mem_tracker) {
+    return Status("Mem limit exceeded.");
+}
 } // end namespace palo
 
diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h
index adc614c9ba..e83c52e936 100644
--- a/be/src/runtime/runtime_state.h
+++ b/be/src/runtime/runtime_state.h
@@ -28,15 +28,14 @@
 #include 
 
 #include 
-#include 
-#include 
-// stringstream is a typedef, so can't forward declare it.
-#include 
 #include 
+#include 
+#include 
+#include 
+#include 
 
-#include "runtime/exec_env.h"
+#include "common/global_types.h"
 #include "util/logging.h"
-#include "runtime/descriptors.h"  // for PlanNodeId
 #include "runtime/mem_pool.h"
 #include "runtime/thread_resource_mgr.h"
 #include "gen_cpp/Types_types.h"  // for TUniqueId
@@ -61,6 +60,9 @@ class TmpFileMgr;
 class BufferedBlockMgr;
 class BufferedBlockMgr2;
 class LoadErrorHub;
+class ReservationTracker;
+class InitialReservations;
+class RowDescriptor;
 
 // A collection of items that are part of the global state of a
 // query and shared across all execution nodes of that query.
@@ -93,6 +95,9 @@ public:
     // This function also initializes a user function mem tracker (in the fourth level).
     Status init_mem_trackers(const TUniqueId& query_id);
 
+    /// Called from Init() to set up buffer reservations and the file group.
+    Status init_buffer_poolstate();
+
     // Gets/Creates the query wide block mgr.
     Status create_block_mgr();
 
@@ -151,21 +156,6 @@ public:
     ExecEnv* exec_env() {
         return _exec_env;
     }
-    DataStreamMgr* stream_mgr() {
-        return _exec_env->stream_mgr();
-    }
-    ResultBufferMgr* result_mgr() {
-        return _exec_env->result_mgr();
-    }
-    BackendServiceClientCache* client_cache() {
-        return _exec_env->client_cache();
-    }
-    FrontendServiceClientCache* frontend_client_cache() {
-        return _exec_env->frontend_client_cache();
-    }
-    DiskIoMgr* io_mgr() {
-        return _exec_env->disk_io_mgr();
-    }
     std::vector* mem_trackers() {
         return &_mem_trackers;
     }
@@ -193,10 +183,6 @@ public:
         return _root_node_id + 1;
     }
 
-    ThreadPool* etl_thread_pool() {
-        return _exec_env->etl_thread_pool();
-    }
-
     // Returns true if the codegen object has been created. Note that this may return false
     // even when codegen is enabled if nothing has been codegen'd.
     bool codegen_created() const {
@@ -245,9 +231,9 @@ public:
         return _process_status;
     };
 
-    MemPool* udf_pool() {
-        return _udf_pool.get();
-    };
+//    MemPool* udf_pool() {
+//        return _udf_pool.get();
+//    };
 
     // Create and return a stream receiver for _fragment_instance_id
     // from the data stream manager. The receiver is added to _data_stream_recvrs_pool.
@@ -450,6 +436,34 @@ public:
         return _per_fragment_instance_idx;
     }
 
+    ReservationTracker* instance_buffer_reservation() {
+        return _instance_buffer_reservation.get();
+    }
+
+    int64_t min_reservation() {
+        return _query_options.min_reservation;
+    }
+
+    int64_t max_reservation() {
+        return _query_options.max_reservation;
+    } 
+
+    bool disable_stream_preaggregations() {
+        return _query_options.disable_stream_preaggregations;
+    }
+
+     // the following getters are only valid after Prepare()
+    InitialReservations* initial_reservations() const { 
+        return _initial_reservations; 
+    }
+
+    ReservationTracker* buffer_reservation() const { 
+        return _buffer_reservation; 
+    }
+
+    /// Helper to call QueryState::StartSpilling().
+    Status StartSpilling(MemTracker* mem_tracker);
+
 private:
     // Allow TestEnv to set block_mgr manually for testing.
     friend class TestEnv;
@@ -465,7 +479,7 @@ private:
 
     Status create_error_log_file();
 
-    static const int DEFAULT_BATCH_SIZE = 1024;
+    static const int DEFAULT_BATCH_SIZE = 2048;
 
     DescriptorTbl* _desc_tbl;
     std::shared_ptr _obj_pool;
@@ -533,7 +547,7 @@ private:
     // will not necessarily be set in all error cases.
     boost::mutex _process_status_lock;
     Status _process_status;
-    boost::scoped_ptr _udf_pool;
+    //boost::scoped_ptr _udf_pool;
 
     // BufferedBlockMgr object used to allocate and manage blocks of input data in memory
     // with a fixed memory budget.
@@ -570,6 +584,26 @@ private:
     std::ofstream* _error_log_file; // error file path, absolute path
     std::unique_ptr _error_hub;
 
+    //TODO chenhao , move this to QueryState 
+    /// Pool of buffer reservations used to distribute initial reservations to operators
+    /// in the query. Contains a ReservationTracker that is a child of
+    /// 'buffer_reservation_'. Owned by 'obj_pool_'. Set in Prepare().
+    ReservationTracker* _buffer_reservation = nullptr;
+
+    /// Buffer reservation for this fragment instance - a child of the query buffer
+    /// reservation. Non-NULL if 'query_state_' is not NULL.
+    boost::scoped_ptr _instance_buffer_reservation;
+
+    /// Pool of buffer reservations used to distribute initial reservations to operators
+    /// in the query. Contains a ReservationTracker that is a child of
+    /// 'buffer_reservation_'. Owned by 'obj_pool_'. Set in Prepare().
+    InitialReservations* _initial_reservations = nullptr;
+
+    /// Number of fragment instances executing, which may need to claim
+    /// from 'initial_reservations_'.
+    /// TODO: not needed if we call ReleaseResources() in a timely manner (IMPALA-1575).
+    AtomicInt32 _initial_reservation_refcnt;
+
     // prohibit copies
     RuntimeState(const RuntimeState&);
 };
diff --git a/be/src/runtime/test_env.cc b/be/src/runtime/test_env.cc
index a9a7712a06..791f382b1f 100644
--- a/be/src/runtime/test_env.cc
+++ b/be/src/runtime/test_env.cc
@@ -26,12 +26,12 @@ using boost::shared_ptr;
 
 namespace palo {
 
-boost::scoped_ptr TestEnv::_s_static_metrics;
+boost::scoped_ptr TestEnv::_s_static_metrics;
 
 TestEnv::TestEnv() {
     if (_s_static_metrics == NULL) {
-        _s_static_metrics.reset(new MetricGroup("test_env"));
-        PaloMetrics::create_metrics(_s_static_metrics.get());
+        _s_static_metrics.reset(new MetricRegistry("test_env"));
+        // PaloMetrics::create_metrics(_s_static_metrics.get());
     }
     _exec_env.reset(new ExecEnv);
     _exec_env->init_for_tests();
@@ -44,7 +44,7 @@ TestEnv::TestEnv() {
 }
 
 void TestEnv::init_metrics() {
-    _metrics.reset(new MetricGroup("test_env"));
+    _metrics.reset(new MetricRegistry("test_env"));
 }
 
 void TestEnv::init_tmp_file_mgr(const std::vector& tmp_dirs,
diff --git a/be/src/runtime/test_env.h b/be/src/runtime/test_env.h
index f895328242..8c911a7d31 100644
--- a/be/src/runtime/test_env.h
+++ b/be/src/runtime/test_env.h
@@ -65,7 +65,7 @@ public:
     MemTracker* io_mgr_tracker() {
         return _io_mgr_tracker.get();
     }
-    MetricGroup* metrics() {
+    MetricRegistry* metrics() {
         return _metrics.get();
     }
     TmpFileMgr* tmp_file_mgr() {
@@ -81,11 +81,11 @@ private:
     RuntimeState* create_runtime_state(int64_t query_id);
 
     // Global state for test environment.
-    static boost::scoped_ptr _s_static_metrics;
+    static boost::scoped_ptr _s_static_metrics;
     boost::scoped_ptr _exec_env;
     boost::scoped_ptr _block_mgr_parent_tracker;
     boost::scoped_ptr _io_mgr_tracker;
-    boost::scoped_ptr _metrics;
+    boost::scoped_ptr _metrics;
     boost::scoped_ptr _tmp_file_mgr;
 
     // Per-query states with associated block managers.
diff --git a/be/src/runtime/tmp_file_mgr.cc b/be/src/runtime/tmp_file_mgr.cc
index 705b70e478..bb24adc1d7 100644
--- a/be/src/runtime/tmp_file_mgr.cc
+++ b/be/src/runtime/tmp_file_mgr.cc
@@ -55,10 +55,10 @@ const std::string TMP_FILE_MGR_ACTIVE_SCRATCH_DIRS = "tmp_file_mgr.active_scratc
 const std::string TMP_FILE_MGR_ACTIVE_SCRATCH_DIRS_LIST = "tmp_file_mgr.active_scratch_dirs.list";
 
 TmpFileMgr::TmpFileMgr() :
-        _initialized(false), _dir_status_lock(), _tmp_dirs(),
-        _num_active_scratch_dirs_metric(NULL), _active_scratch_dirs_metric(NULL) {}
+        _initialized(false), _dir_status_lock(), _tmp_dirs() { }
+        // _num_active_scratch_dirs_metric(NULL), _active_scratch_dirs_metric(NULL) {}
 
-Status TmpFileMgr::init(MetricGroup* metrics) {
+Status TmpFileMgr::init(MetricRegistry* metrics) {
     std::string tmp_dirs_spec = config::storage_root_path;
     vector all_tmp_dirs;
     // Empty string should be interpreted as no scratch
@@ -71,7 +71,7 @@ Status TmpFileMgr::init(MetricGroup* metrics) {
 }
 
 Status TmpFileMgr::init_custom(
-        const vector& tmp_dirs, bool one_dir_per_device, MetricGroup* metrics) {
+        const vector& tmp_dirs, bool one_dir_per_device, MetricRegistry* metrics) {
     DCHECK(!_initialized);
     if (tmp_dirs.empty()) {
         LOG(WARNING) << "Running without spill to disk: no scratch directories provided.";
@@ -123,17 +123,18 @@ Status TmpFileMgr::init_custom(
     }
 
     DCHECK(metrics != NULL);
-    _num_active_scratch_dirs_metric = metrics->AddGauge(
-            TMP_FILE_MGR_ACTIVE_SCRATCH_DIRS, 0L);
+    _num_active_scratch_dirs_metric.reset(new IntGauge());
+    metrics->register_metric("active_scratch_dirs", _num_active_scratch_dirs_metric.get());
     //_active_scratch_dirs_metric = metrics->register_metric(new SetMetric(
     //        TMP_FILE_MGR_ACTIVE_SCRATCH_DIRS_LIST,
     //        std::set()));
-    _active_scratch_dirs_metric = SetMetric::CreateAndRegister(
-    metrics, TMP_FILE_MGR_ACTIVE_SCRATCH_DIRS_LIST, std::set());
-    _num_active_scratch_dirs_metric->update(_tmp_dirs.size());
-    for (int i = 0; i < _tmp_dirs.size(); ++i) {
-        _active_scratch_dirs_metric->add(_tmp_dirs[i].path());
-    }
+    // TODO(zc):
+    // _active_scratch_dirs_metric = SetMetric::CreateAndRegister(
+    // metrics, TMP_FILE_MGR_ACTIVE_SCRATCH_DIRS_LIST, std::set());
+    _num_active_scratch_dirs_metric->set_value(_tmp_dirs.size());
+    // for (int i = 0; i < _tmp_dirs.size(); ++i) {
+    //     _active_scratch_dirs_metric->add(_tmp_dirs[i].path());
+    // }
 
     _initialized = true;
 
@@ -186,7 +187,7 @@ void TmpFileMgr::blacklist_device(DeviceId device_id) {
     }
     if (added) {
         _num_active_scratch_dirs_metric->increment(-1);
-        _active_scratch_dirs_metric->remove(_tmp_dirs[device_id].path());
+        // _active_scratch_dirs_metric->remove(_tmp_dirs[device_id].path());
     }
 }
 
diff --git a/be/src/runtime/tmp_file_mgr.h b/be/src/runtime/tmp_file_mgr.h
index a270005967..69fe4479a8 100644
--- a/be/src/runtime/tmp_file_mgr.h
+++ b/be/src/runtime/tmp_file_mgr.h
@@ -23,12 +23,14 @@
 
 #include "common/status.h"
 #include "gen_cpp/Types_types.h"  // for TUniqueId
-//#include "util/non_primitive_metrics.hpp"
-#include "util/collection_metrics.h"
+// #include "util/collection_metrics.h"
 #include "util/spinlock.h"
+#include "util/metrics.h"
 
 namespace palo {
 
+class MetricRegistry;
+
 // TmpFileMgr creates and manages temporary files and directories on the local
 // filesystem. It can manage multiple temporary directories across multiple devices.
 // TmpFileMgr ensures that at most one directory per device is used unless overridden
@@ -122,7 +124,7 @@ public:
 
     // Creates the configured tmp directories. If multiple directories are specified per
     // disk, only one is created and used. Must be called after DiskInfo::Init().
-    Status init(MetricGroup* metrics);
+    Status init(MetricRegistry* metrics);
 
     // Custom initialization - initializes with the provided list of directories.
     // If one_dir_per_device is true, only use one temporary directory per device.
@@ -130,7 +132,7 @@ public:
     Status init_custom(
             const std::vector& tmp_dirs,
             bool one_dir_per_device,
-            MetricGroup* metrics);
+            MetricRegistry* metrics);
 
     // Return a new File handle with a unique path for a query instance. The file path
     // is within the (single) tmp directory on the specified device id. The caller owns
@@ -193,9 +195,9 @@ private:
     // The created tmp directories.
     std::vector _tmp_dirs;
 
-    // MetricGroup to track active scratch directories.
-    IntGauge* _num_active_scratch_dirs_metric;
-    SetMetric* _active_scratch_dirs_metric;
+    // MetricRegistry to track active scratch directories.
+    std::unique_ptr _num_active_scratch_dirs_metric;
+    // SetMetric* _active_scratch_dirs_metric;
 };
 
 } // end namespace palo
diff --git a/be/src/runtime/tuple.h b/be/src/runtime/tuple.h
index 1a9a1d4e07..88193968c3 100644
--- a/be/src/runtime/tuple.h
+++ b/be/src/runtime/tuple.h
@@ -172,11 +172,6 @@ public:
         return reinterpret_cast(reinterpret_cast(this) + offset);
     }
 
-    __int128* get_large_int_slot(int offset) {
-        DCHECK(offset != -1);  // -1 offset indicates non-materialized slot
-        return reinterpret_cast<__int128*>(reinterpret_cast(this) + offset);
-    }
-
     // For C++/IR interop, we need to be able to look up types by name.
     static const char* _s_llvm_class_name;
 
diff --git a/be/src/runtime/tuple_row.h b/be/src/runtime/tuple_row.h
index 7937db7b34..5d0787d221 100644
--- a/be/src/runtime/tuple_row.h
+++ b/be/src/runtime/tuple_row.h
@@ -23,7 +23,6 @@
 
 #include "runtime/descriptors.h"
 #include "runtime/mem_pool.h"
-#include "runtime/row_batch.h"
 #include "runtime/tuple.h"
 
 namespace palo {
diff --git a/be/src/runtime/types.h b/be/src/runtime/types.h
index 09526c97ea..d4c7087dd7 100644
--- a/be/src/runtime/types.h
+++ b/be/src/runtime/types.h
@@ -28,7 +28,7 @@
 #include "runtime/primitive_type.h"
 #include "thrift/protocol/TDebugProtocol.h"
 #include "common/config.h"
-#include "olap/field.h"
+#include "olap/hll.h"
 
 namespace llvm {
 class ConstantStruct;
diff --git a/be/src/runtime/vectorized_row_batch.cpp b/be/src/runtime/vectorized_row_batch.cpp
index 2befc861e2..68938ecc4a 100644
--- a/be/src/runtime/vectorized_row_batch.cpp
+++ b/be/src/runtime/vectorized_row_batch.cpp
@@ -14,216 +14,125 @@
 // under the License.
 
 #include "runtime/vectorized_row_batch.h"
+
 #include "common/logging.h"
+#include "olap/row_block.h"
 
 namespace palo {
 
-//VectorizedRowBatch::VectorizedRowBatch(const TupleDescriptor& tuple_desc, int capacity)
 VectorizedRowBatch::VectorizedRowBatch(
-        const std::vector& schema, int capacity)
-    : _schema(schema), _capacity(capacity), _num_cols(schema.size()) {
+        const std::vector& schema,
+        const std::vector& cols,
+        int capacity)
+            : _schema(schema), _cols(cols), _capacity(capacity), _limit(capacity) {
     _selected_in_use = false;
     _size = 0;
 
-    _mem_tracker.reset(new MemTracker(-1));
-    _mem_pool.reset(new MemPool(_mem_tracker.get()));
+    _tracker.reset(new MemTracker(-1));
+    _mem_pool.reset(new MemPool(_tracker.get()));
 
-    _row_iter = 0;
-    _has_backup = false;
-    _selected = reinterpret_cast(_mem_pool->allocate(sizeof(int) * _capacity));
-
-    for (int i = 0; i < _num_cols; ++i) {
-        boost::shared_ptr col_vec(new ColumnVector(_capacity));
-        _columns.push_back(col_vec);
+    _selected = reinterpret_cast(new char[sizeof(uint16_t) * _capacity]);
+    for (int i = 0; i < schema.size(); ++i) {
+        _vectors.push_back(new ColumnVector());
     }
 }
 
-bool VectorizedRowBatch::get_next_tuple(Tuple* tuple, const TupleDescriptor& tuple_desc) {
-    if (_row_iter < _size) {
-        std::vector slots = tuple_desc.slots();
-        if (_selected_in_use) {
-            for (int i = 0; i < slots.size(); ++i) {
-                void* slot = tuple->get_slot(slots[i]->tuple_offset());
-                memory_copy(slot,
-                        reinterpret_cast(_columns[i]->col_data())
-                        + get_slot_size(slots[i]->type().type) * _selected[_row_iter],
-                        get_slot_size(slots[i]->type().type));
-            }
-        } else {
-            for (int i = 0; i < slots.size(); ++i) {
-                void* slot = tuple->get_slot(slots[i]->tuple_offset());
-                memory_copy(slot,
-                        reinterpret_cast(_columns[i]->col_data())
-                        + get_slot_size(slots[i]->type().type) * _row_iter,
-                        get_slot_size(slots[i]->type().type));
-            }
-        }
-        ++_row_iter;
-        return true;
-    } else {
-        return false;
-    }
-}
-
-void VectorizedRowBatch::to_row_batch(RowBatch* row_batch, const TupleDescriptor& tuple_desc) {
-    const std::vector slots = tuple_desc.slots();
-    int row_remain = row_batch->capacity() - row_batch->num_rows();
-    int size = std::min(row_remain, _size - _row_iter);
-
-    if (size <= 0) {
-        return;
-    }
-
-    int row_index = row_batch->add_rows(size);
-    DCHECK(row_index != RowBatch::INVALID_ROW_INDEX);
-    uint8_t* tuple_buf = row_batch->tuple_data_pool()->allocate(
-                             size * tuple_desc.byte_size());
-    bzero(tuple_buf, size * tuple_desc.byte_size());
-    Tuple* tuple = reinterpret_cast(tuple_buf);
-
+void VectorizedRowBatch::dump_to_row_block(RowBlock* row_block) {
     if (_selected_in_use) {
-        for (int i = _row_iter; i < _row_iter + size; ++i) {
-            for (int j = 0; j < slots.size(); ++j) {
-                // TODO(hujie01) bad code need optimize
-                if (slots[j]->type().is_string_type()) {
-                    StringValue* src = reinterpret_cast(
-                                           reinterpret_cast(_columns[j]->col_data())
-                                           + slots[j]->type().get_slot_size() * _selected[i]);
-                    uint8_t* v = row_batch->tuple_data_pool()->allocate(src->len);
-                    memory_copy(v, src->ptr, src->len);
-                    // if src->len == 0 then dst->ptr = NULL
-                    StringValue* slot = tuple->get_string_slot(slots[j]->tuple_offset());
-                    slot->ptr = reinterpret_cast(v);
-                    slot->len = src->len;
-                } else {
-                    void* slot = tuple->get_slot(slots[j]->tuple_offset());
-                    memory_copy(slot,
-                                reinterpret_cast(_columns[j]->col_data())
-                                + slots[j]->type().get_slot_size() * _selected[i],
-                                slots[j]->type().get_slot_size());
+        for (auto column_id : _cols) {
+            bool no_nulls = _vectors[column_id]->no_nulls();
+            // pointer of this field's vector
+            char* vec_field_ptr = (char*)_vectors[column_id]->col_data();
+            // pointer of this field in row block
+            char* row_field_ptr =
+                row_block->_mem_buf + row_block->_field_offset_in_memory[column_id];
+            const FieldInfo& field_info = _schema[column_id];
+            size_t field_size = 0;
+            if (field_info.type == OLAP_FIELD_TYPE_CHAR ||
+                field_info.type == OLAP_FIELD_TYPE_VARCHAR ||
+                field_info.type == OLAP_FIELD_TYPE_HLL) {
+                field_size = sizeof(StringSlice);
+            } else {
+                field_size = field_info.length;
+            }
+            if (no_nulls) {
+                for (int row = 0; row < _size; ++row) {
+                    char* vec_field =
+                        vec_field_ptr + _selected[row] * field_size;
+                    // Set not null
+                    *row_field_ptr = 0;
+                    memory_copy(row_field_ptr + 1, vec_field, field_size);
+
+                    // point to next row
+                    row_field_ptr += row_block->_mem_row_bytes;
+                }
+            } else {
+                bool* is_null = _vectors[column_id]->is_null();
+                for (int row = 0; row < _size; ++row) {
+                    if (is_null[_selected[row]]) {
+                        *row_field_ptr = 1;
+                    } else {
+                        char* vec_field =
+                            vec_field_ptr + _selected[row] * field_size;
+                        // Set not null
+                        *row_field_ptr = 0;
+                        memory_copy(row_field_ptr + 1, vec_field, field_size);
+                    }
+                    row_field_ptr += row_block->_mem_row_bytes;
                 }
             }
-
-            TupleRow* row = row_batch->get_row(row_index++);
-            row->set_tuple(0, tuple);
-            tuple = reinterpret_cast(reinterpret_cast(tuple) +
-                                             tuple_desc.byte_size());
         }
     } else {
-        for (int i = _row_iter; i < _row_iter + size; ++i) {
-            for (int j = 0; j < slots.size(); ++j) {
-                // TODO(hujie01) bad code need optimize
-                if (slots[j]->type().is_string_type()) {
-                    StringValue* slot = tuple->get_string_slot(slots[j]->tuple_offset());
-                    StringValue* src = reinterpret_cast(
-                                           reinterpret_cast(_columns[j]->col_data())
-                                           + slots[j]->type().get_slot_size() * i);
-                    uint8_t* v = row_batch->tuple_data_pool()->allocate(src->len);
-                    memory_copy(v, src->ptr, src->len);
-                    // if src->len == 0 then dst->ptr = NULL
-                    slot->ptr = reinterpret_cast(v);
-                    slot->len = src->len;
-                } else {
-                    void* slot = tuple->get_slot(slots[j]->tuple_offset());
-                    memory_copy(slot,
-                                reinterpret_cast(_columns[j]->col_data())
-                                + slots[j]->type().get_slot_size() * i,
-                                slots[j]->type().get_slot_size());
+        for (auto column_id : _cols) {
+            bool no_nulls = _vectors[column_id]->no_nulls();
+
+            char* vec_field_ptr = (char*)_vectors[column_id]->col_data();
+            char* row_field_ptr =
+                row_block->_mem_buf + row_block->_field_offset_in_memory[column_id];
+            const FieldInfo& field_info = _schema[column_id];
+
+            size_t field_size = 0;
+            if (field_info.type == OLAP_FIELD_TYPE_CHAR ||
+                field_info.type == OLAP_FIELD_TYPE_VARCHAR ||
+                field_info.type == OLAP_FIELD_TYPE_HLL) {
+                field_size = sizeof(StringSlice);
+            } else {
+                field_size = field_info.length;
+            }
+
+            if (no_nulls) {
+                for (int row = 0; row < _size; ++row) {
+                    char* vec_field = vec_field_ptr;
+                    // Set not null
+                    *row_field_ptr = 0;
+                    memory_copy(row_field_ptr + 1, vec_field, field_size);
+                    row_field_ptr += row_block->_mem_row_bytes;
+                    vec_field_ptr += field_size;
+                }
+            } else {
+                bool* is_null = _vectors[column_id]->is_null();
+                for (int row = 0; row < _size; ++row) {
+                    if (is_null[row]) {
+                        *row_field_ptr = 1;
+                    } else {
+                        char* vec_field = vec_field_ptr;
+                        // Set not null
+                        *row_field_ptr = 0;
+                        memory_copy(row_field_ptr + 1, vec_field, field_size);
+                    }
+                    row_field_ptr += row_block->_mem_row_bytes;
+                    vec_field_ptr += field_size;
                 }
             }
-
-            TupleRow* row = row_batch->get_row(row_index++);
-            row->set_tuple(0, tuple);
-            tuple = reinterpret_cast(reinterpret_cast(tuple) +
-                                             tuple_desc.byte_size());
         }
     }
 
-    _row_iter += size;
-    row_batch->commit_rows(size);
+    row_block->_pos = 0;
+    row_block->_limit = _size;
+    row_block->_info.row_num = _size;
+    row_block->_block_status = _block_status;
+    row_block->mem_pool()->free_all();
+    row_block->mem_pool()->acquire_data(_mem_pool.get(), false);
 }
 
-#if 0
-void VectorizedRowBatch::reorganized_from_pax(
-        RowBlock
-        const std::vector& schema) {
-    for (int i = 0, j = 0; i < _num_cols && j < schema.size(); ++i) {
-        if (_schema[i].unique_id != schema[j].unique_id
-                || column(i)->col_data() != NULL) {
-            continue;
-        }
-        ++j;
-
-        switch (_schema[i].type) {
-        case OLAP_FIELD_TYPE_STRING: {
-            StringValue* value = reinterpret_cast(
-                                     _mem_pool.allocate(get_slot_size(TYPE_VARCHAR) * _size));
-            int len = column(i)->byte_size() / _size;
-            char* raw = reinterpret_cast(column(i)->col_data());
-
-            for (int j = 0; j < _size; ++j) {
-                value[j].ptr = raw + len * j;
-                value[j].len = strnlen(value[j].ptr, len);
-            }
-
-            column(i)->set_col_data(value);
-            break;
-        }
-        case OLAP_FIELD_TYPE_VARCHAR:
-             OLAP_FIELD_TYPE_HLL :{
-            typedef uint32_t OffsetValueType;
-            typedef uint16_t LengthValueType;
-            DCHECK_EQ(sizeof(OffsetValueType) * _size, column(i)->byte_size());
-            StringValue* value = reinterpret_cast(
-                                     _mem_pool.allocate(get_slot_size(TYPE_VARCHAR) * _size));
-            OffsetValueType* offsets = reinterpret_cast(
-                                           column(i)->col_data());
-            char* raw = reinterpret_cast(column(i)->col_string_data());
-
-            for (int j = 0; j < _size; ++j) {
-                value[j].len = *reinterpret_cast(raw + offsets[j]);
-                value[j].ptr = raw + offsets[j] + sizeof(LengthValueType);
-            }
-
-            column(i)->set_col_data(value);
-            break;
-        }
-        case OLAP_FIELD_TYPE_DATE: {
-            uint8_t* value = reinterpret_cast(
-                                 _mem_pool.allocate(get_slot_size(TYPE_DATE) * _size));
-            uint8_t* raw = reinterpret_cast(column(i)->col_data());
-
-            for (int j = 0; j < _size; ++j) {
-                new(value + j * get_slot_size(TYPE_DATE))
-                TimestampValue(raw + j * 3, OLAP_DATETIME);
-            }
-
-            column(i)->set_col_data(value);
-            break;
-        }
-        case OLAP_FIELD_TYPE_DATETIME: {
-            uint8_t* value = reinterpret_cast(
-                                 _mem_pool.allocate(get_slot_size(TYPE_DATETIME) * _size));
-            uint8_t* raw = reinterpret_cast(column(i)->col_data());
-
-            for (int j = 0; j < _size; ++j) {
-                new(value + j * get_slot_size(TYPE_DATETIME))
-                TimestampValue(raw + j * 8, OLAP_DATE);
-            }
-
-            column(i)->set_col_data(value);
-            break;
-        }
-        default:
-            break;
-        }
-    }
-}
-#endif
-
-//void VectorizedRowBatch::reorganized_from_dsm() {
-
-//}
-}
-
-/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+} // namespace palo
diff --git a/be/src/runtime/vectorized_row_batch.h b/be/src/runtime/vectorized_row_batch.h
index 6b3961cdba..1c830c23a0 100644
--- a/be/src/runtime/vectorized_row_batch.h
+++ b/be/src/runtime/vectorized_row_batch.h
@@ -28,33 +28,32 @@
 #include "runtime/row_batch_interface.hpp"
 #include "runtime/row_batch.h"
 #include "util/mem_util.hpp"
+#include "olap/row_cursor.h"
 
 namespace palo {
 
 class VectorizedRowBatch;
-
-struct BackupInfo {
-    BackupInfo() : selected_in_use(false), size(0), selected(NULL) {}
-
-    bool selected_in_use;
-    int size;
-    int* selected;
-};
+class RowBlock;
 
 class ColumnVector {
 public:
-    virtual ~ColumnVector() {
-        //if (NULL == _is_null) {
-            //delete _is_null;
-        //}
+    ColumnVector() { }
+    ~ColumnVector() {}
+
+    bool* is_null() const {
+        return _is_null;
     }
 
-    inline bool is_repeating() {
-        return _is_repeating;
+    void set_is_null(bool* is_null) {
+        _is_null = is_null;
     }
 
-    void set_is_repeating(bool is_repeating) {
-        _is_repeating = is_repeating;
+    bool no_nulls() const {
+        return _no_nulls;
+    }
+
+    void set_no_nulls(bool no_nulls) {
+        _no_nulls = no_nulls;
     }
 
     void* col_data() {
@@ -63,71 +62,40 @@ public:
     void set_col_data(void* data) {
         _col_data = data;
     }
-
-    void* col_string_data() {
-        return _col_string_data;
-    }
-    void set_col_string_data(void* data) {
-        _col_string_data = data;
-    }
-
-    int byte_size() {
-        return _byte_size;
-    }
-    void set_byte_size(int byte_size) {
-        _byte_size = byte_size;
-    }
 private:
-    ColumnVector(int size) {
-        _is_repeating = false;
-        //_no_nulls = true;
-        //_is_null = new bool[size];
-        _col_data = NULL;
-        _col_string_data = NULL;
-        _byte_size = 0;
-    }
-    friend class VectorizedRowBatch;
-    void* _col_data;
-    void* _col_string_data;
-    int _byte_size;
-    bool _is_repeating;
-    // this is no null in palo now
-    //bool _no_nulls;
-    //bool* _is_null;
+    void* _col_data = nullptr;
+    bool _no_nulls = false;
+    bool* _is_null = nullptr;
 };
 
-class VectorizedRowBatch : public RowBatchInterface {
+class VectorizedRowBatch {
 public:
-    //VectorizedRowBatch(const TupleDescriptor& tuple_desc, int capacity);
-    VectorizedRowBatch(const std::vector& schema, int capacity);
-    virtual ~VectorizedRowBatch() { }
+    VectorizedRowBatch(
+        const std::vector& schema,
+        const std::vector& cols,
+        int capacity);
+    ~VectorizedRowBatch() {
+        for (auto col_vec : _vectors) {
+            delete col_vec;
+        }
+        delete[] _selected;
+    }
 
     MemPool* mem_pool() {
         return _mem_pool.get();
     }
 
-    void add_column(int index, const TypeDescriptor& type) {
-        if (-1 == index) {
-            return;
-        }
-
-        DCHECK_EQ(index, _columns.size());
-        boost::shared_ptr col_vec(new ColumnVector(_capacity));
-        col_vec->set_col_data(_mem_pool->allocate(type.get_slot_size() * _capacity));
-        _columns.push_back(col_vec);
-    }
-
     ColumnVector* column(int column_index) {
-        DCHECK_GE(column_index, 0);
-        DCHECK_LT(column_index, _columns.size());
-        return _columns[column_index].get();
+        return _vectors[column_index];
     }
 
-    int capacity() {
+    const std::vector& columns() const { return _cols; }
+
+    uint16_t capacity() {
         return _capacity;
     }
 
-    int size() {
+    uint16_t size() {
         return _size;
     }
 
@@ -148,85 +116,39 @@ public:
         _selected_in_use = selected_in_use;
     }
 
-    int* selected() const {
+    uint16_t* selected() const {
         return _selected;
     }
 
-    void set_selected(int* selected) {
-        for (int i = 0; i < _capacity; ++i) {
-            _selected[i] = selected[i];
-        }
-    }
-
-    inline void backup() {
-        _backup_info.size = _size;
-        _backup_info.selected_in_use = _selected_in_use;
-        if (_selected_in_use) {
-            if (NULL == _backup_info.selected) {
-                _backup_info.selected
-                    = reinterpret_cast(_mem_pool->allocate(sizeof(int) * _capacity));
-            }
-            for (int i = 0; i < _capacity; ++i) {
-                _backup_info.selected[i] = _selected[i];
-            }
-        }
-        _has_backup = true;
-    }
-
-    inline void restore() {
-        if (_has_backup) {
-            _size = _backup_info.size;
-            _selected_in_use = _backup_info.selected_in_use;
-            if (_selected_in_use) {
-                _selected = _backup_info.selected;
-            }
-            _row_iter = 0;
-        }
-    }
-
-    //// reorganized memory layout from PAX storage
-    //void reorganized_from_pax(const std::vector& schema);
-
-    //// reorganized memory layout from DSM storage(Column Store)
-    //void reorganized_from_dsm();
-
-    inline bool is_iterator_end() {
-        return _row_iter >= _size;
-    }
-
-    inline void reset_row_iterator() {
-        _row_iter = 0;
-    }
-
-    inline void reset() {
+    inline void clear() {
         _size = 0;
         _selected_in_use = false;
-        _row_iter = 0;
-        _columns.erase(_columns.begin() + _num_cols, _columns.end());
+        _limit = _capacity;
         _mem_pool->clear();
-        _selected = reinterpret_cast(_mem_pool->allocate(sizeof(int) * _capacity));
     }
 
-    bool get_next_tuple(Tuple* tuple, const TupleDescriptor& tuple_desc);
+    uint16_t limit() const { return _limit; }
+    void set_limit(uint16_t limit) { _limit = limit; }
+    void set_block_status(uint8_t status) { _block_status = status; }
+    uint8_t block_status() const { return _block_status; }
 
-    void to_row_batch(RowBatch* row_batch, const TupleDescriptor& tuple_desc);
+    // Dump this vectorized batch into the given RowBlock.
+    void dump_to_row_block(RowBlock* row_block);
 
 private:
-    //const TupleDescriptor& _tuple_desc;
-    std::vector _schema;
-    Tuple* _tuple;
-    const int _capacity;
-    const int _num_cols;
-    int _size;
-    int* _selected;
-    bool _selected_in_use;
-    int _row_iter;
-    BackupInfo _backup_info;
-    bool _has_backup;
-    std::vector > _columns;
+    const std::vector& _schema;
+    const std::vector& _cols;
+    const uint16_t _capacity;
+    uint16_t _size = 0;
+    uint16_t* _selected = nullptr;
+    std::vector _vectors;
 
-    std::unique_ptr _mem_tracker;
+    bool _selected_in_use = false;
+    uint8_t _block_status;
+
+    std::unique_ptr _tracker;
     std::unique_ptr _mem_pool;
+    uint16_t _limit;
 };
 
 }
diff --git a/be/src/service/CMakeLists.txt b/be/src/service/CMakeLists.txt
index 453f7bd6db..dbefdcb559 100644
--- a/be/src/service/CMakeLists.txt
+++ b/be/src/service/CMakeLists.txt
@@ -22,6 +22,8 @@ set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/src/service")
 add_library(Service
     backend_options.cpp
     backend_service.cpp
+    brpc_service.cpp
+    internal_service.cpp
 )
 
 add_executable(palo_be
@@ -29,7 +31,7 @@ add_executable(palo_be
 )
 
 # This permits libraries loaded by dlopen to link to the symbols in the program.
-set_target_properties(palo_be PROPERTIES LINK_FLAGS -rdynamic)
+# set_target_properties(palo_be PROPERTIES LINK_FLAGS -pthread)
 
 target_link_libraries(palo_be
     ${PALO_LINK_LIBS}
diff --git a/be/src/service/backend_service.cpp b/be/src/service/backend_service.cpp
index 54fa546f32..deb2230c4d 100644
--- a/be/src/service/backend_service.cpp
+++ b/be/src/service/backend_service.cpp
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+
 #include "service/backend_options.h"
 #include "util/network_util.h"
 #include "util/thrift_util.h"
@@ -48,18 +49,15 @@ using apache::thrift::concurrency::PosixThreadFactory;
 BackendService::BackendService(ExecEnv* exec_env) :
         _exec_env(exec_env),
         _agent_server(new AgentServer(exec_env, *exec_env->master_info())) {
-#ifndef ADDRESS_SANITIZER
+#if !defined(ADDRESS_SANITIZER) && !defined(LEAK_SANITIZER) && !defined(THREAD_SANITIZER)
     // tcmalloc and address sanitizer can not be used together
     if (!config::heap_profile_dir.empty()) {
         HeapProfilerStart(config::heap_profile_dir.c_str());
     }
 #endif
-    // Initialize Palo metrics
-    PaloMetrics::create_metrics(exec_env->metrics());
     char buf[64];
     DateTimeValue value = DateTimeValue::local_time();
     value.to_string(buf);
-    PaloMetrics::palo_be_start_time()->update(buf);
 }
 
 Status BackendService::create_service(ExecEnv* exec_env, int port, ThriftServer** server) {
@@ -71,8 +69,9 @@ Status BackendService::create_service(ExecEnv* exec_env, int port, ThriftServer*
     boost::shared_ptr thread_factory(new PosixThreadFactory());
 
     boost::shared_ptr be_processor(new BackendServiceProcessor(handler));
-    *server = new ThriftServer("PaloBackend",
-                               be_processor, port,
+    *server = new ThriftServer("backend",
+                               be_processor,
+                               port,
                                exec_env->metrics(),
                                config::be_service_threads);
 
diff --git a/be/src/service/backend_service.h b/be/src/service/backend_service.h
index 677e2ddcbe..697a7bf1dc 100644
--- a/be/src/service/backend_service.h
+++ b/be/src/service/backend_service.h
@@ -22,9 +22,9 @@
 #define BDG_PALO_BE_SERVICE_BACKEND_SERVICE_H
 
 #include 
+#include "agent/agent_server.h"
 #include "common/status.h"
 #include "gen_cpp/BackendService.h"
-#include "agent/agent_server.h"
 #include 
 
 namespace palo {
diff --git a/be/src/service/brpc.h b/be/src/service/brpc.h
new file mode 100644
index 0000000000..de56c0790b
--- /dev/null
+++ b/be/src/service/brpc.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2018, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+// This file is used to fix macro conflicts between butil and gutil.
+// All headers needed by brpc are contained in this file;
+// include this file instead of including the brpc headers directly,
+// and this file must be the first include in the source file.
+
+#include "gutil/macros.h"
+// For macros also defined in gutil/macros.h, prefer butil's definitions
+#ifdef DISALLOW_IMPLICIT_CONSTRUCTORS
+#undef DISALLOW_IMPLICIT_CONSTRUCTORS
+#endif
+
+#ifdef arraysize
+#undef arraysize
+#endif
+
+#undef OVERRIDE
+#undef FINAL
+
+// Use be/src/gutil/integral_types.h to override butil/basictypes.h
+#include "gutil/integral_types.h"
+#ifdef BASE_INTEGRAL_TYPES_H_
+#define BUTIL_BASICTYPES_H_
+#endif
+
+#include "gutil/logging-inl.h"
+
+#ifdef DEBUG_MODE
+#undef DEBUG_MODE
+#endif
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
diff --git a/be/src/service/brpc_service.cpp b/be/src/service/brpc_service.cpp
new file mode 100644
index 0000000000..5d8ba779ef
--- /dev/null
+++ b/be/src/service/brpc_service.cpp
@@ -0,0 +1,48 @@
+// Copyright (c) 2018, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "service/brpc_service.h"
+
+#include 
+
+#include "common/logging.h"
+#include "service/brpc.h"
+#include "service/internal_service.h"
+
+namespace palo {
+
+BRpcService::BRpcService(ExecEnv* exec_env)
+        : _exec_env(exec_env),
+        _server(new brpc::Server()) {
+}
+
+BRpcService::~BRpcService() {
+}
+
+Status BRpcService::start(int port) {
+    // Add service
+    _server->AddService(new PInternalServiceImpl(_exec_env), brpc::SERVER_OWNS_SERVICE);
+    // start service
+    brpc::ServerOptions options;
+    if (_server->Start(port, &options) != 0) {
+        char buf[64];
+        LOG(WARNING) << "start brpc failed, errno=" << errno
+            << ", errmsg=" << strerror_r(errno, buf, 64) << ", port=" << port;
+        return Status("start brpc service failed");
+    }
+    return Status::OK;
+}
+
+}
diff --git a/be/src/service/brpc_service.h b/be/src/service/brpc_service.h
new file mode 100644
index 0000000000..72b319cc86
--- /dev/null
+++ b/be/src/service/brpc_service.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2018, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include 
+
+#include "common/status.h"
+
+namespace brpc {
+class Server;
+}
+
+namespace palo {
+
+class ExecEnv;
+
+// Class enclosing the brpc service
+class BRpcService {
+public:
+    BRpcService(ExecEnv* exec_env);
+    ~BRpcService();
+
+    Status start(int port);
+
+private:
+    ExecEnv* _exec_env;
+    std::unique_ptr _server;
+};
+
+}
diff --git a/be/src/service/internal_service.cpp b/be/src/service/internal_service.cpp
new file mode 100644
index 0000000000..ccfb56d44c
--- /dev/null
+++ b/be/src/service/internal_service.cpp
@@ -0,0 +1,55 @@
+// Copyright (c) 2018, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "service/internal_service.h"
+
+#include "runtime/exec_env.h"
+#include "runtime/data_stream_mgr.h"
+#include "service/brpc.h"
+
+namespace palo {
+
+PInternalServiceImpl::PInternalServiceImpl(ExecEnv* exec_env) : _exec_env(exec_env) {
+}
+
+PInternalServiceImpl::~PInternalServiceImpl() {
+}
+
+void PInternalServiceImpl::transmit_data(google::protobuf::RpcController* cntl_base,
+                                         const PTransmitDataParams* request,
+                                         PTransmitDataResult* response,
+                                         google::protobuf::Closure* done) {
+    bool eos = request->eos();
+    if (request->has_row_batch()) {
+        _exec_env->stream_mgr()->add_data(
+            request->finst_id(), request->node_id(),
+            request->row_batch(), request->sender_id(),
+            request->be_number(), request->packet_seq(),
+            eos ? nullptr : &done);
+    }
+    if (eos) {
+        TUniqueId finst_id;
+        finst_id.__set_hi(request->finst_id().hi());
+        finst_id.__set_lo(request->finst_id().lo());
+        _exec_env->stream_mgr()->close_sender(
+            finst_id, request->node_id(),
+            request->sender_id(), request->be_number());
+    }
+    if (done != nullptr) {
+        done->Run();
+    }
+}
+
+}
diff --git a/be/src/service/internal_service.h b/be/src/service/internal_service.h
new file mode 100644
index 0000000000..aa3ce2eae3
--- /dev/null
+++ b/be/src/service/internal_service.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2018, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "gen_cpp/internal_service.pb.h"
+
+namespace palo {
+
+class ExecEnv;
+
+class PInternalServiceImpl : public PInternalService {
+public:
+    PInternalServiceImpl(ExecEnv* exec_env);
+    virtual ~PInternalServiceImpl();
+
+    void transmit_data(::google::protobuf::RpcController* controller,
+                       const ::palo::PTransmitDataParams* request,
+                       ::palo::PTransmitDataResult* response,
+                       ::google::protobuf::Closure* done) override;
+private:
+    ExecEnv* _exec_env;
+};
+
+}
diff --git a/be/src/service/palo_main.cpp b/be/src/service/palo_main.cpp
index 19c6b69bc9..5831f39889 100644
--- a/be/src/service/palo_main.cpp
+++ b/be/src/service/palo_main.cpp
@@ -42,6 +42,7 @@
 #include "olap/olap_main.h"
 #include "service/backend_options.h"
 #include "service/backend_service.h"
+#include "service/brpc_service.h"
 #include 
 #include "common/resource_tls.h"
 #include "exec/schema_scanner/frontend_helper.h"
@@ -80,12 +81,6 @@ int main(int argc, char** argv) {
         exit(-1);
     }
 
-    int lock_res = flock(fd, LOCK_EX | LOCK_NB);
-    if (lock_res < 0) {
-        fprintf(stderr, "fail to lock pid file, maybe another process is locking it.");
-        exit(-1);
-    }
-
     string pid = std::to_string((long)getpid());
     pid += "\n";
     size_t length = write(fd, pid.c_str(), pid.size());
@@ -94,12 +89,22 @@ int main(int argc, char** argv) {
         exit(-1);
     }
 
+    // descriptor will be leaked when failing to close fd
+    if (::close(fd) < 0) {
+        fprintf(stderr, "failed to close fd of pidfile.");
+        exit(-1);
+    }
+
     string conffile = string(getenv("PALO_HOME")) + "/conf/be.conf";
     if (!palo::config::init(conffile.c_str(), false)) {
         fprintf(stderr, "error read config file. \n");
         return -1;
     }
 
+#if !defined(ADDRESS_SANITIZER) && !defined(LEAK_SANITIZER) && !defined(THREAD_SANITIZER)
+    MallocExtension::instance()->SetNumericProperty(
+        "tcmalloc.aggressive_memory_decommit", 21474836480);
+#endif
     palo::LlvmCodeGen::initialize_llvm();
     palo::init_daemon(argc, argv);
 
@@ -138,6 +143,14 @@ int main(int argc, char** argv) {
         exit(1);
     }
 
+    palo::BRpcService brpc_service(&exec_env);
+    status = brpc_service.start(palo::config::brpc_port);
+    if (!status.ok()) {
+        LOG(ERROR) << "BRPC service did not start correctly, exiting";
+        palo::shutdown_logging();
+        exit(1);
+    }
+
     status = exec_env.start_services();
     if (!status.ok()) {
         LOG(ERROR) << "Palo Be services did not start correctly, exiting";
@@ -160,13 +173,13 @@ int main(int argc, char** argv) {
         palo::shutdown_logging();
         exit(1);
     }
-    heartbeat_thrift_server->start();
-
-    // this blocks until the beeswax and hs2 servers terminate
-    palo::PaloMetrics::palo_be_ready()->update(true);
-    LOG(INFO) << "Palo has started.";
-
-    //be_server->join();
+    
+    status = heartbeat_thrift_server->start();
+    if (!status.ok()) {
+        LOG(ERROR) << "Palo BE HeartBeat Service did not start correctly, exiting";
+        palo::shutdown_logging();
+        exit(1);
+    }
 
     palo::ReactorFactory::join();
 
diff --git a/be/src/testutil/desc_tbl_builder.h b/be/src/testutil/desc_tbl_builder.h
index 286ae3073c..e686433f30 100644
--- a/be/src/testutil/desc_tbl_builder.h
+++ b/be/src/testutil/desc_tbl_builder.h
@@ -23,6 +23,8 @@
 
 #include "runtime/runtime_state.h"
 
+#include "runtime/types.h"
+
 namespace palo {
 
 class ObjectPool;
diff --git a/be/src/util/CMakeLists.txt b/be/src/util/CMakeLists.txt
index aba2bb2606..3eb7bb6912 100644
--- a/be/src/util/CMakeLists.txt
+++ b/be/src/util/CMakeLists.txt
@@ -26,6 +26,7 @@ set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/src/util")
 
 add_library(Util STATIC
   bfd_parser.cpp
+  bitmap.cpp
   codec.cpp
   compress.cpp
   cpu_info.cpp
@@ -51,6 +52,7 @@ add_library(Util STATIC
   thrift_client.cpp
   thrift_server.cpp
   symbols_util.cpp
+  system_metrics.cpp
   url_parser.cpp
   url_coding.cpp
   file_utils.cpp
@@ -63,7 +65,12 @@ add_library(Util STATIC
   load_error_hub.cpp
   mysql_load_error_hub.cpp
   null_load_error_hub.cpp
+  time.cpp
+  os_info.cpp
+#  coding_util.cpp
   cidr.cpp
+  core_local.cpp
+  rpc_channel.cpp
 )
 
 #ADD_BE_TEST(integer-array-test)
diff --git a/be/src/util/aligned_new.h b/be/src/util/aligned_new.h
new file mode 100755
index 0000000000..ae9a08faed
--- /dev/null
+++ b/be/src/util/aligned_new.h
@@ -0,0 +1,60 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_SRC_UTIL_ALIGNED_NEW_H_
+#define BDG_PALO_BE_SRC_UTIL_ALIGNED_NEW_H_
+
+#include 
+
+#include "common/compiler_util.h"
+#include "common/logging.h"
+
+namespace palo {
+
+// Objects that should be allocated, for performance or correctness reasons, at alignment
+// greater than that promised by the global new (16) can inherit publicly from AlignedNew.
+template 
+struct alignas(ALIGNMENT) AlignedNew {
+  static_assert(ALIGNMENT > 0, "ALIGNMENT must be positive");
+  static_assert((ALIGNMENT & (ALIGNMENT - 1)) == 0, "ALIGNMENT must be a power of 2");
+  static_assert(
+      (ALIGNMENT % sizeof(void*)) == 0, "ALIGNMENT must be a multiple of sizeof(void *)");
+  static void* operator new(std::size_t count) { return Allocate(count); }
+  static void* operator new[](std::size_t count) { return Allocate(count); }
+  static void operator delete(void* ptr) { free(ptr); }
+  static void operator delete[](void* ptr) { free(ptr); }
+
+ private:
+  static void* Allocate(std::size_t count) {
+    void* result = nullptr;
+    const auto alloc_failed = posix_memalign(&result, ALIGNMENT, count);
+    if (alloc_failed) {
+      LOG(ERROR) << "Failed to allocate aligned memory; return code " << alloc_failed;
+      throw std::bad_alloc();
+    }
+    DCHECK(result != nullptr);
+    return result;
+  }
+};
+
+using CacheLineAligned = AlignedNew;
+}
+
+#endif
diff --git a/be/src/util/bit_util.h b/be/src/util/bit_util.h
index 50e0b42925..8dc42992f7 100644
--- a/be/src/util/bit_util.h
+++ b/be/src/util/bit_util.h
@@ -25,6 +25,7 @@
 
 #include "common/compiler_util.h"
 #include "util/cpu_info.h"
+#include "gutil/bits.h"
 
 namespace palo {
 
@@ -208,6 +209,138 @@ public:
     }
 #endif
 
+  /// Returns the smallest power of two that contains v. If v is a power of two, v is
+  /// returned. Taken from
+  /// http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+  static inline int64_t RoundUpToPowerOfTwo(int64_t v) {
+    --v;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16; 
+    v |= v >> 32; 
+    ++v;
+    return v;
+  }
+
+  // Wrap the gutil/ version for convenience.
+  static inline int Log2FloorNonZero64(uint64_t n) {
+    return Bits::Log2FloorNonZero64(n);
+  }
+
+  // Wrap the gutil/ version for convenience.
+  static inline int Log2Floor64(uint64_t n) {
+    return Bits::Log2Floor64(n);
+  }
+
+  static inline int Log2Ceiling64(uint64_t n) {
+    int floor = Log2Floor64(n);
+    // Check if zero or a power of two. This pattern is recognised by gcc and optimised
+    // into branch-free code.
+    if (0 == (n & (n - 1))) {
+      return floor;
+    } else {
+      return floor + 1;
+    }
+  }
+
+  static inline int Log2CeilingNonZero64(uint64_t n) {
+    int floor = Log2FloorNonZero64(n);
+    // Check if zero or a power of two. This pattern is recognised by gcc and optimised
+    // into branch-free code.
+    if (0 == (n & (n - 1))) {
+      return floor;
+    } else {
+      return floor + 1;
+    }
+  }
+  
+  // Returns the number of 64-bit words needed to hold 'bits' bits (rounds up).
+  static inline uint32_t round_up_numi_64(uint32_t bits) {
+    return (bits + 63) >> 6;
+  }
+
+  constexpr static inline int64_t Ceil(int64_t value, int64_t divisor) {
+    return value / divisor + (value % divisor != 0); 
+  }
+
+  constexpr static inline bool IsPowerOf2(int64_t value) {
+    return (value & (value - 1)) == 0;
+  }
+
+  constexpr static inline int64_t RoundDown(int64_t value, int64_t factor) {
+    return (value / factor) * factor;
+  }
+
+  /// Specialized round up and down functions for frequently used factors,
+  /// like 8 (bits->bytes), 32 (bits->i32), and 64 (bits->i64)
+  /// Returns the rounded up number of bytes that fit the number of bits.
+  constexpr static inline uint32_t RoundUpNumBytes(uint32_t bits) {
+    return (bits + 7) >> 3;
+  }
+
+  /// Non hw accelerated pop count.
+  /// TODO: we don't use this in any perf sensitive code paths currently.  There
+  /// might be a much faster way to implement this.
+  static inline int PopcountNoHw(uint64_t x) {
+    int count = 0;
+    for (; x != 0; ++count) x &= x-1;
+    return count;
+  }
+
+  /// Returns the number of set bits in x
+  static inline int Popcount(uint64_t x) {
+    //if (LIKELY(CpuInfo::is_supported(CpuInfo::POPCNT))) {
+    //  return POPCNT_popcnt_u64(x);
+    //} else {
+    return PopcountNoHw(x);
+   // }
+  }
+
+  // Compute correct population count for various-width signed integers
+  template
+  static inline int PopcountSigned(T v) {
+    // Converting to same-width unsigned then extending preserves the bit pattern.
+    return BitUtil::Popcount(static_cast::type>(v));
+  }
+
+  /// Logical right shift for signed integer types
+  /// This is needed because the C >> operator does arithmetic right shift
+  /// Negative shift amounts lead to undefined behavior
+  template 
+  constexpr static T ShiftRightLogical(T v, int shift) {
+    // Conversion to unsigned ensures most significant bits always filled with 0's
+    return static_cast::type>(v) >> shift;
+  } 
+
+  /// Get an specific bit of a numeric type
+  template
+  static inline int8_t GetBit(T v, int bitpos) {
+    T masked = v & (static_cast(0x1) << bitpos);
+    return static_cast(ShiftRightLogical(masked, bitpos));
+  }
+
+  /// Set a specific bit to 1
+  /// Behavior when bitpos is negative is undefined
+  template 
+  constexpr static T SetBit(T v, int bitpos) {
+    return v | (static_cast(0x1) << bitpos);
+  }
+
+  /// Set a specific bit to 0
+  /// Behavior when bitpos is negative is undefined
+  template 
+  constexpr static T UnsetBit(T v, int bitpos) {
+    return v & ~(static_cast(0x1) << bitpos);
+  }
+
+  /// Returns 'value' rounded up to the nearest multiple of 'factor' when factor is
+  /// a power of two
+  static inline int64_t RoundUpToPowerOf2(int64_t value, int64_t factor) {
+    DCHECK((factor > 0) && ((factor & (factor - 1)) == 0));
+    return (value + (factor - 1)) & ~(factor - 1);
+  }
 };
 
 }
diff --git a/be/src/util/bitmap.cpp b/be/src/util/bitmap.cpp
new file mode 100644
index 0000000000..6e063be4f4
--- /dev/null
+++ b/be/src/util/bitmap.cpp
@@ -0,0 +1,47 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "util/bitmap.h"
+
+#include 
+
+using namespace palo;
+
+std::string Bitmap::DebugString(bool print_bits) const {
+  int64_t words = BitUtil::round_up(num_bits_, 64) / 64;
+  std::stringstream ss;
+  ss << "Size (" << num_bits_ << ") words (" << words << ") ";
+  if (print_bits) {
+    for (int i = 0; i < num_bits(); ++i) {
+      if (Get(i)) {
+        ss << "1";
+      } else {
+        ss << "0";
+      }
+    }
+  } else {
+    for (auto v : buffer_) {
+      ss << v << ".";
+    }
+  }
+  ss << std::endl;
+  return ss.str();
+}
+
diff --git a/be/src/util/bitmap.h b/be/src/util/bitmap.h
new file mode 100644
index 0000000000..68d180b87a
--- /dev/null
+++ b/be/src/util/bitmap.h
@@ -0,0 +1,100 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_SRC_COMMON_UITL_BITMAP_H
+#define BDG_PALO_BE_SRC_COMMON_UITL_BITMAP_H
+
+#include "util/bit_util.h"
+
+namespace palo {
+
+/// Bitmap vector utility class.
+/// TODO: investigate perf.
+///  - Precomputed bitmap
+///  - Explicit Set/Unset() apis
+///  - Bigger words
+///  - size bitmap to Mersenne prime.
+class Bitmap {
+ public:
+  Bitmap(int64_t num_bits) {
+    DCHECK_GE(num_bits, 0);
+    buffer_.resize(BitUtil::round_up_numi_64(num_bits));
+    num_bits_ = num_bits;
+  }
+
+  /// Resize bitmap and set all bits to zero.
+  void Reset(int64_t num_bits) {
+    DCHECK_GE(num_bits, 0);
+    buffer_.resize(BitUtil::round_up_numi_64(num_bits));
+    num_bits_ = num_bits;
+    SetAllBits(false);
+  }
+
+  /// Compute memory usage of a bitmap, not including the Bitmap object itself.
+  static int64_t MemUsage(int64_t num_bits) {
+    DCHECK_GE(num_bits, 0);
+    return BitUtil::round_up_numi_64(num_bits) * sizeof(int64_t);
+  }
+
+  /// Compute memory usage of this bitmap, not including the Bitmap object itself.
+  int64_t MemUsage() const { return MemUsage(num_bits_); }
+
+  /// Sets the bit at 'bit_index' to v.
+  void Set(int64_t bit_index, bool v) {
+    int64_t word_index = bit_index >> NUM_OFFSET_BITS;
+    bit_index &= BIT_INDEX_MASK;
+    DCHECK_LT(word_index, buffer_.size());
+    if (v) {
+      buffer_[word_index] |= (1LL << bit_index);
+    } else {
+      buffer_[word_index] &= ~(1LL << bit_index);
+    }
+  }
+
+  /// Returns true if the bit at 'bit_index' is set.
+  bool Get(int64_t bit_index) const {
+    int64_t word_index = bit_index >> NUM_OFFSET_BITS;
+    bit_index &= BIT_INDEX_MASK;
+    DCHECK_LT(word_index, buffer_.size());
+    return (buffer_[word_index] & (1LL << bit_index)) != 0;
+  }
+
+  void SetAllBits(bool b) {
+    memset(&buffer_[0], 255 * b, buffer_.size() * sizeof(uint64_t));
+  }
+
+  int64_t num_bits() const { return num_bits_; }
+
+  /// If 'print_bits' prints 0/1 per bit, otherwise it prints the int64_t value.
+  std::string DebugString(bool print_bits) const;
+
+ private:
+  std::vector buffer_;
+  int64_t num_bits_;
+
+  /// Used for bit shifting and masking for the word and offset calculation.
+  static const int64_t NUM_OFFSET_BITS = 6;
+  static const int64_t BIT_INDEX_MASK = 63;
+};
+
+}
+
+#endif
+
diff --git a/be/src/util/brpc_stub_cache.h b/be/src/util/brpc_stub_cache.h
new file mode 100644
index 0000000000..7ec04116ec
--- /dev/null
+++ b/be/src/util/brpc_stub_cache.h
@@ -0,0 +1,72 @@
+// Copyright (c) 2018, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include 
+#include 
+
+#include "gen_cpp/Types_types.h" // TNetworkAddress
+#include "gen_cpp/internal_service.pb.h"
+#include "service/brpc.h"
+#include "util/spinlock.h"
+
+namespace palo {
+
+// map used 
+class BrpcStubCache {
+public:
+    BrpcStubCache() {
+        _stub_map.init(239);
+    }
+    ~BrpcStubCache() {
+        for (auto& stub : _stub_map) {
+            delete stub.second;
+        }
+    }
+
+    PInternalService_Stub* get_stub(const butil::EndPoint& endpoint) {
+        std::lock_guard l(_lock);
+        auto stub_ptr = _stub_map.seek(endpoint);
+        if (stub_ptr != nullptr) {
+            return *stub_ptr;
+        }
+        // new one stub and insert into map
+        brpc::ChannelOptions options;
+        std::unique_ptr channel(new brpc::Channel());
+        if (channel->Init(endpoint, &options)) {
+            return nullptr;
+        }
+        auto stub = new PInternalService_Stub(
+            channel.release(), google::protobuf::Service::STUB_OWNS_CHANNEL);
+        _stub_map.insert(endpoint, stub);
+        return stub;
+    }
+
+    PInternalService_Stub* get_stub(const TNetworkAddress& taddr) {
+        butil::EndPoint endpoint;
+        if (str2endpoint(taddr.hostname.c_str(), taddr.port, &endpoint)) {
+            LOG(WARNING) << "unknown endpoint, hostname=" << taddr.hostname;
+            return nullptr;
+        }
+        return get_stub(endpoint);
+    }
+
+private:
+    SpinLock _lock;
+    butil::FlatMap _stub_map;
+};
+
+}
diff --git a/be/src/util/cidr.cpp b/be/src/util/cidr.cpp
index 8a2b328ebf..8ef0e996f8 100644
--- a/be/src/util/cidr.cpp
+++ b/be/src/util/cidr.cpp
@@ -45,25 +45,36 @@ bool CIDR::reset(const std::string& cidr_str) {
     std::vector cidr_items;
     boost::split(cidr_items, cidr_format_str, boost::is_any_of("/"));
     if (cidr_items.size() != 2) {
-        LOG(ERROR) << "wrong CIDR format. network=" << cidr_str;
+        LOG(WARNING) << "wrong CIDR format. network=" << cidr_str;
         return false;
     }
 
     if (cidr_items[1].empty()) {
-        LOG(ERROR) << "wrong CIDR mask format. network=" << cidr_str;
+        LOG(WARNING) << "wrong CIDR mask format. network=" << cidr_str;
         return false;
     }
 
     char* endptr = nullptr;
     int32_t mask_length = strtol(cidr_items[1].c_str(), &endptr, 10);
-    if (errno != 0 || mask_length <= 0 || mask_length > 32) {
-        LOG(ERROR) << "wrong CIDR mask format. network=" << cidr_str;
+    if ((errno == ERANGE && (mask_length == LONG_MAX || mask_length == LONG_MIN)) ||
+        (errno != 0 && mask_length == 0)) {
+        char errmsg[64];
+        strerror_r(errno, errmsg, 64);
+        LOG(WARNING) << "wrong CIDR mask format. network=" << cidr_str
+            << ", mask_length=" << mask_length
+            << ", errno=" << errno
+            << ", errmsg=" << errmsg;
+        return false;
+    }
+    if (mask_length <= 0 || mask_length > 32) {
+        LOG(WARNING) << "wrong CIDR mask format. network=" << cidr_str
+            << ", mask_length=" << mask_length;
         return false;
     }
 
     uint32_t address = 0;
     if (!ip_to_int(cidr_items[0], &address)) {
-        LOG(ERROR) << "wrong CIDR IP value. network=" << cidr_str;
+        LOG(WARNING) << "wrong CIDR IP value. network=" << cidr_str;
         return false;
     }
     _address = address;
diff --git a/be/src/util/collection_metrics.h b/be/src/util/collection_metrics.h
deleted file mode 100644
index c8de9914b3..0000000000
--- a/be/src/util/collection_metrics.h
+++ /dev/null
@@ -1,294 +0,0 @@
-// Modifications copyright (C) 2017, Baidu.com, Inc.
-// Copyright 2017 The Apache Software Foundation
-
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef BDG_PALO_BE_SRC_UTIL_COLLECTION_METRICS_H
-#define BDG_PALO_BE_SRC_UTIL_COLLECTION_METRICS_H
-
-#include "util/metrics.h"
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-//#include "util/pretty-printer.h"
-
-namespace palo {
-
-/// Collection metrics are those whose values have more structure than simple
-/// scalar types. Therefore they need specialised ToJson() methods, and
-/// typically a specialised API for updating the values they contain.
-
-/// Metric whose value is a set of items
-template 
-class SetMetric : public Metric {
-public:
-    static SetMetric* CreateAndRegister(MetricGroup* metrics, const std::string& key,
-            const std::set& value) {
-        return metrics->register_metric(new SetMetric(MetricDefs::Get(key), value));
-    }
-
-    SetMetric(const TMetricDef& def, const std::set& value)
-        : Metric(def), _value(value) {
-            DCHECK_EQ(def.kind, TMetricKind::SET);
-        }
-
-    /// Put an item in this set.
-    void add(const T& item) {
-        boost::lock_guard l(_lock);
-        _value.insert(item);
-    }
-
-    /// Remove an item from this set by value.
-    void remove(const T& item) {
-        boost::lock_guard l(_lock);
-        _value.erase(item);
-    }
-
-    /// Copy out value.
-    std::set value() {
-        boost::lock_guard l(_lock);
-        return _value;
-    }
-
-    void reset() { _value.clear(); }
-
-    virtual void ToJson(rapidjson::Document* document, rapidjson::Value* value) {
-        rapidjson::Value container(rapidjson::kObjectType);
-        AddStandardFields(document, &container);
-        rapidjson::Value metric_list(rapidjson::kArrayType);
-        for (const T& s: _value) {
-            rapidjson::Value entry_value;
-            ToJsonValue(s, TUnit::NONE, document, &entry_value);
-            metric_list.PushBack(entry_value, document->GetAllocator());
-        }
-        container.AddMember("items", metric_list, document->GetAllocator());
-        *value = container;
-    }
-
-    virtual void ToLegacyJson(rapidjson::Document* document) {
-        rapidjson::Value metric_list(rapidjson::kArrayType);
-        for (const T& s: _value) {
-            rapidjson::Value entry_value;
-            ToJsonValue(s, TUnit::NONE, document, &entry_value);
-            metric_list.PushBack(entry_value, document->GetAllocator());
-        }
-        document->AddMember(rapidjson::Value(_key.c_str(), document->GetAllocator()), 
-            metric_list, document->GetAllocator());
-    }
-
-    virtual std::string ToHumanReadable() {
-        std::stringstream out;
-        //PrettyPrinter::printStringList>(
-        //    _value, TUnit::NONE, &out);
-        return out.str();
-    }
-
-    virtual void print_value(std::stringstream* out) {}
-    virtual void print_value_json(std::stringstream* out) {}
-
-private:
-    /// Lock protecting the set
-    boost::mutex _lock;
-
-    /// The set of items
-    std::set _value;
-};
-
-/// Enum to define which statistic types are available in the StatsMetric
-struct StatsType {
-    enum type {
-        MIN = 1,
-        MAX = 2,
-        MEAN = 4,
-        STDDEV = 8,
-        COUNT = 16,
-        ALL = 31
-    };
-};
-
-/// Metric which accumulates min, max and mean of all values, plus a count of samples
-/// seen. The output can be controlled by passing a bitmask as a template parameter to
-/// indicate which values should be printed or returned as JSON.
-///
-/// Printed output looks like: name: count:
-/// 4, last: 0.0141, min: 4.546e-06, max: 0.0243, mean: 0.0336, stddev: 0.0336
-///
-/// After construction, all statistics are ill-defined, but count will be 0. The first call
-/// to Update() will initialise all stats.
-template 
-class StatsMetric : public Metric {
-public:
-    static StatsMetric* CreateAndRegister(MetricGroup* metrics, const std::string& key,
-            const std::string& arg = "") {
-        return metrics->register_metric(new StatsMetric(MetricDefs::Get(key, arg)));
-    }
-
-    StatsMetric(const TMetricDef& def) : Metric(def), _unit(def.units) {
-        DCHECK_EQ(def.kind, TMetricKind::STATS);
-    }
-
-    void Update(const T& value) {
-        boost::lock_guard l(_lock);
-        _value = value;
-        _acc(value);
-    }
-
-    void Reset() {
-        boost::lock_guard l(_lock);
-        _acc = Accumulator();
-    }
-
-    virtual void ToJson(rapidjson::Document* document, rapidjson::Value* val) {
-        boost::lock_guard l(_lock);
-        rapidjson::Value container(rapidjson::kObjectType);
-        AddStandardFields(document, &container);
-        rapidjson::Value units(PrintTUnit(_unit).c_str(), document->GetAllocator());
-        container.AddMember("units", units, document->GetAllocator());
-
-        if (StatsSelection & StatsType::COUNT) {
-            container.AddMember("count",
-                    static_cast(boost::accumulators::count(_acc)),
-                    document->GetAllocator());
-        }
-
-        if (boost::accumulators::count(_acc) > 0) {
-            container.AddMember("last", _value, document->GetAllocator());
-
-            if (StatsSelection & StatsType::MIN) {
-                container.AddMember("min",
-                        static_cast(boost::accumulators::min(_acc)),
-                        document->GetAllocator());
-            }
-
-            if (StatsSelection & StatsType::MAX) {
-                container.AddMember("max", boost::accumulators::max(_acc),
-                        document->GetAllocator());
-            }
-
-            if (StatsSelection & StatsType::MEAN) {
-                container.AddMember("mean", boost::accumulators::mean(_acc),
-                        document->GetAllocator());
-            }
-
-            if (StatsSelection & StatsType::STDDEV) {
-                container.AddMember("stddev", sqrt(boost::accumulators::variance(_acc)),
-                        document->GetAllocator());
-            }
-        }
-        *val = container;
-    }
-
-    virtual void ToLegacyJson(rapidjson::Document* document) {
-        std::stringstream ss;
-        boost::lock_guard l(_lock);
-        rapidjson::Value container(rapidjson::kObjectType);
-
-        if (StatsSelection & StatsType::COUNT) {
-            container.AddMember("count", boost::accumulators::count(_acc),
-                    document->GetAllocator());
-        }
-
-        if (boost::accumulators::count(_acc) > 0) {
-            container.AddMember("last", _value, document->GetAllocator());
-            if (StatsSelection & StatsType::MIN) {
-                container.AddMember("min", boost::accumulators::min(_acc),
-                        document->GetAllocator());
-            }
-
-            if (StatsSelection & StatsType::MAX) {
-                container.AddMember("max", boost::accumulators::max(_acc),
-                        document->GetAllocator());
-            }
-
-            if (StatsSelection & StatsType::MEAN) {
-                container.AddMember("mean", boost::accumulators::mean(_acc),
-                        document->GetAllocator());
-            }
-
-            if (StatsSelection & StatsType::STDDEV) {
-                container.AddMember("stddev", sqrt(boost::accumulators::variance(_acc)),
-                        document->GetAllocator());
-            }
-        }
-        rapidjson::Value temp(_key.c_str(), document->GetAllocator());
-        document->AddMember(temp, container, document->GetAllocator());
-    }
-
-    virtual std::string ToHumanReadable() {
-        std::stringstream out;
-        if (StatsSelection & StatsType::COUNT) {
-            out << "count: " << boost::accumulators::count(_acc);
-            if (boost::accumulators::count(_acc) > 0) out << ", ";
-        }
-        if (boost::accumulators::count(_acc) > 0) {
-            out << "last: " << PrettyPrinter::print(_value, _unit);
-            if (StatsSelection & StatsType::MIN) {
-                out << ", min: " << PrettyPrinter::print(boost::accumulators::min(_acc), _unit);
-            }
-
-            if (StatsSelection & StatsType::MAX) {
-                out << ", max: " << PrettyPrinter::print(boost::accumulators::max(_acc), _unit);
-            }
-
-            if (StatsSelection & StatsType::MEAN) {
-                out << ", mean: " << PrettyPrinter::print(boost::accumulators::mean(_acc), _unit);
-            }
-
-            if (StatsSelection & StatsType::STDDEV) {
-                out << ", stddev: " << PrettyPrinter::print(
-                    sqrt(boost::accumulators::variance(_acc)), _unit);
-            }
-        }
-        return out.str();
-    }
-    virtual void print_value(std::stringstream* out) {}
-    virtual void print_value_json(std::stringstream* out) {}
-
-private:
-    /// The units of the values captured in this metric, used when pretty-printing.
-    TUnit::type _unit;
-
-    /// Lock protecting the value and the accumulator_set
-    boost::mutex _lock;
-
-    /// The last value
-    T _value;
-
-    /// The set of accumulators that update the statistics on each Update()
-    typedef boost::accumulators::accumulator_set> Accumulator;
-    Accumulator _acc;
-
-};
-
-};
-
-#endif
diff --git a/be/src/util/condition_variable.h b/be/src/util/condition_variable.h
new file mode 100755
index 0000000000..185fb341e3
--- /dev/null
+++ b/be/src/util/condition_variable.h
@@ -0,0 +1,68 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_SRC_UTIL_CONDITION_VARIABLE_H
+#define BDG_PALO_BE_SRC_UTIL_CONDITION_VARIABLE_H
+
+#include 
+#include 
+#include 
+#include 
+
+namespace palo {
+
+/// Simple wrapper around POSIX pthread condition variable. This has lower overhead than
+/// boost's implementation as it doesn't implement boost thread interruption.
+class ConditionVariable {
+ public:
+  ConditionVariable() { pthread_cond_init(&cv_, NULL); }
+
+  ~ConditionVariable() { pthread_cond_destroy(&cv_); }
+
+  /// Wait indefinitely on the condition variable until it's notified.
+  inline void Wait(boost::unique_lock& lock) {
+    DCHECK(lock.owns_lock());
+    pthread_mutex_t* mutex = lock.mutex()->native_handle();
+    pthread_cond_wait(&cv_, mutex);
+  }
+
+  /// Wait until the condition variable is notified or 'timeout' has passed.
+  /// Returns true if the condition variable is notified before the absolute timeout
+  /// specified in 'timeout' has passed. Returns false otherwise.
+  inline bool TimedWait(boost::unique_lock& lock,
+      const struct timespec* timeout) {
+    DCHECK(lock.owns_lock());
+    pthread_mutex_t* mutex = lock.mutex()->native_handle();
+    return pthread_cond_timedwait(&cv_, mutex, timeout) == 0;
+  }
+
+  /// Notify a single waiter on this condition variable.
+  inline void NotifyOne() { pthread_cond_signal(&cv_); }
+
+  /// Notify all waiters on this condition variable.
+  inline void NotifyAll() { pthread_cond_broadcast(&cv_); }
+
+ private:
+  pthread_cond_t cv_;
+
+};
+
+}
+#endif
diff --git a/be/src/util/core_local.cpp b/be/src/util/core_local.cpp
new file mode 100644
index 0000000000..da8dd4a77f
--- /dev/null
+++ b/be/src/util/core_local.cpp
@@ -0,0 +1,115 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "util/core_local.h"
+
+#include 
+#include 
+
+#include "common/logging.h"
+
+namespace palo {
+
+constexpr int BLOCK_SIZE = 4096;
+struct alignas(CACHE_LINE_SIZE) CoreDataBlock {
+    void* at(size_t offset) { return data + offset; }
+    char data[BLOCK_SIZE];
+
+    static void* operator new(size_t nbytes) {
+        void *p = nullptr;
+        if (posix_memalign(&p, alignof(CoreDataBlock), nbytes) == 0) {
+            return p;
+        }
+        throw std::bad_alloc();
+    }
+
+    static void operator delete(void* p) {
+        free(p);
+    }
+};
+
+template
+class CoreDataAllocatorImpl : public CoreDataAllocator {
+public:
+    virtual ~CoreDataAllocatorImpl();
+    void* get_or_create(size_t id) override {
+        size_t block_id = id / ELEMENTS_PER_BLOCK;
+        if (block_id >= _blocks.size()) {
+            _blocks.resize(block_id + 1);
+        }
+        CoreDataBlock* block = _blocks[block_id];
+        if (block == nullptr) {
+            block = new CoreDataBlock();
+            _blocks[block_id] = block;
+        }
+        size_t offset = (id % ELEMENTS_PER_BLOCK) * ELEMENT_BYTES;
+        return block->at(offset);
+    }
+private:
+    static constexpr int ELEMENTS_PER_BLOCK = BLOCK_SIZE / ELEMENT_BYTES;
+    std::vector _blocks;
+};
+
+template
+CoreDataAllocatorImpl::~CoreDataAllocatorImpl() {
+    for (auto block : _blocks) {
+        delete block;
+    }
+}
+
+CoreDataAllocatorFactory* CoreDataAllocatorFactory::instance() {
+    static CoreDataAllocatorFactory _s_instance;
+    return &_s_instance;
+}
+
+CoreDataAllocator* CoreDataAllocatorFactory::get_allocator(size_t cpu_idx, size_t data_bytes) {
+    std::lock_guard l(_lock);
+    auto pair = std::make_pair(cpu_idx, data_bytes);
+    auto it = _allocators.find(pair);
+    if (it != std::end(_allocators)) {
+        return it->second;
+    }
+    CoreDataAllocator* allocator = nullptr;
+    switch (data_bytes) {
+    case 1:
+        allocator = new CoreDataAllocatorImpl<1>();
+        break;
+    case 2:
+        allocator = new CoreDataAllocatorImpl<2>();
+        break;
+    case 3:
+    case 4:
+        allocator = new CoreDataAllocatorImpl<4>();
+        break;
+    case 5:
+    case 6:
+    case 7:
+    case 8:
+        allocator = new CoreDataAllocatorImpl<8>();
+        break;
+    default:
+        DCHECK(false) << "don't support core local value for this size, size=" << data_bytes;
+    }
+    _allocators.emplace(pair, allocator);
+    return allocator;
+}
+
+CoreDataAllocatorFactory::~CoreDataAllocatorFactory() {
+    for (auto& it : _allocators) {
+        delete it.second;
+    }
+}
+
+}
diff --git a/be/src/util/core_local.h b/be/src/util/core_local.h
new file mode 100644
index 0000000000..b8d058b623
--- /dev/null
+++ b/be/src/util/core_local.h
@@ -0,0 +1,142 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "common/compiler_util.h"
+#include "gutil/macros.h"
+
+namespace palo {
+
+class CoreDataAllocator {
+public:
+    virtual ~CoreDataAllocator() { }
+    virtual void* get_or_create(size_t id) = 0;
+};
+
+class CoreDataAllocatorFactory {
+public:
+    CoreDataAllocatorFactory() { }
+    ~CoreDataAllocatorFactory();
+    CoreDataAllocator* get_allocator(size_t cpu_id, size_t data_bytes);
+    static CoreDataAllocatorFactory* instance();
+
+private:
+    DISALLOW_COPY_AND_ASSIGN(CoreDataAllocatorFactory);
+
+private:
+    std::mutex _lock;
+    std::map, CoreDataAllocator*> _allocators;
+};
+
+template
+class CoreLocalValueController {
+public:
+    CoreLocalValueController() {
+        int num_cpus = static_cast(std::thread::hardware_concurrency());
+        _size = 8;
+        while (_size < num_cpus) {
+            _size <<= 1;
+        }
+        _allocators.resize(_size, nullptr);
+        for (int i = 0; i < _size; ++i) {
+            _allocators[i] = CoreDataAllocatorFactory::instance()->get_allocator(i, sizeof(T));
+        }
+    }
+
+    ~CoreLocalValueController() { }
+
+    int get_id() {
+        std::lock_guard l(_lock);
+        int id = 0;
+        if (_free_ids.empty()) {
+            id = _next_id++;
+        } else {
+            id = _free_ids.back();
+            _free_ids.pop_back();
+        }
+        return id;
+    }
+    void reclaim_id(int id) {
+        std::lock_guard l(_lock);
+        _free_ids.push_back(id);
+    }
+    size_t size() const { return _size; }
+    CoreDataAllocator* allocator(int i) const {
+        return _allocators[i];
+    }
+
+    static CoreLocalValueController* instance() {
+        static CoreLocalValueController _s_instance;
+        return &_s_instance;
+    }
+
+private:
+    DISALLOW_COPY_AND_ASSIGN(CoreLocalValueController);
+
+private:
+    std::mutex _lock;
+    int _next_id = 0;
+    std::deque _free_ids;
+    std::vector _allocators;
+    size_t _size;
+};
+
+template
+class CoreLocalValue {
+public:
+    CoreLocalValue(const T init_value = T()) {
+        CoreLocalValueController* controller = CoreLocalValueController::instance();
+        _id = controller->get_id();
+        _size = controller->size();
+        _values.resize(_size, nullptr);
+        for (int i = 0; i < _size; ++i) {
+            void* ptr = controller->allocator(i)->get_or_create(_id);
+            _values[i] = new (ptr) T(init_value);
+        }
+    }
+
+    ~CoreLocalValue() {
+        for (int i = 0; i < _size; ++i) {
+            _values[i]->~T();
+        }
+        CoreLocalValueController::instance()->reclaim_id(_id);
+    }
+
+    inline size_t size() const { return _size; }
+    inline T* access() const {
+        size_t cpu_id = sched_getcpu();
+        if (cpu_id >= _size) {
+            cpu_id &= _size - 1;
+        }
+        return access_at_core(cpu_id);
+    }
+    inline T* access_at_core(size_t core_idx) const {
+        return _values[core_idx];
+    }
+private:
+    int _id = -1;
+    size_t _size = 0;
+    std::vector _values;
+};
+
+}
diff --git a/be/src/util/cpu_info.cpp b/be/src/util/cpu_info.cpp
old mode 100644
new mode 100755
index 9b586f1103..2c21768f7b
--- a/be/src/util/cpu_info.cpp
+++ b/be/src/util/cpu_info.cpp
@@ -20,36 +20,81 @@
 
 #include "util/cpu_info.h"
 
+#ifdef __APPLE__
+#include 
+#endif
+
+#include 
+#include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
-#include 
-
 #include 
+#include 
+#include 
 
-#include "util/debug_util.h"
+#include "common/config.h"
+#include "gflags/gflags.h"
+#include "gutil/strings/substitute.h"
 #include "util/pretty_printer.h"
+#include "util/string_parser.hpp"
+
+#include "common/names.h"
+
+using boost::algorithm::contains;
+using boost::algorithm::trim;
+namespace fs = boost::filesystem;
+using std::max;
+
+DECLARE_bool(abort_on_config_error);
+DEFINE_int32(num_cores, 0, "(Advanced) If > 0, it sets the number of cores available to"
+    " Impala. Setting it to 0 means Impala will use all available cores on the machine"
+    " according to /proc/cpuinfo.");
+
+namespace palo {
+// Helper function to warn if a given file does not contain an expected string as its
+// first line. If the file cannot be opened, no error is reported.
+void WarnIfFileNotEqual(
+    const string& filename, const string& expected, const string& warning_text) {
+  ifstream file(filename);
+  if (!file) return;
+  string line;
+  getline(file, line);
+  if (line != expected) {
+    LOG(ERROR) << "Expected " << expected << ", actual " << line << endl << warning_text;
+  }
+}
+} // end anonymous namespace
 
 namespace palo {
 
-bool CpuInfo::_s_initialized = false;
-int64_t CpuInfo::_s_hardware_flags = 0;
-int64_t CpuInfo::_s_original_hardware_flags;
-long CpuInfo::_s_cache_sizes[L3_CACHE + 1];
-int64_t CpuInfo::_s_cycles_per_ms;
-int CpuInfo::_s_num_cores = 1;
-std::string CpuInfo::_s_model_name = "unknown";
+bool CpuInfo::initialized_ = false;
+int64_t CpuInfo::hardware_flags_ = 0;
+int64_t CpuInfo::original_hardware_flags_;
+int64_t CpuInfo::cycles_per_ms_;
+int CpuInfo::num_cores_ = 1;
+int CpuInfo::max_num_cores_;
+string CpuInfo::model_name_ = "unknown";
+int CpuInfo::max_num_numa_nodes_;
+unique_ptr CpuInfo::core_to_numa_node_;
+vector> CpuInfo::numa_node_to_cores_;
+vector CpuInfo::numa_node_core_idx_;
 
 static struct {
-    std::string name;
-    int64_t flag;
-} flag_mappings[] = {
-    { "ssse3",  CpuInfo::SSE3 },
-    { "sse4_1", CpuInfo::SSE4_1 },
-    { "sse4_2", CpuInfo::SSE4_2 },
-    { "popcnt", CpuInfo::POPCNT },
+  string name;
+  int64_t flag;
+} flag_mappings[] =
+{
+  { "ssse3",  CpuInfo::SSSE3 },
+  { "sse4_1", CpuInfo::SSE4_1 },
+  { "sse4_2", CpuInfo::SSE4_2 },
+  { "popcnt", CpuInfo::POPCNT },
+  { "avx",    CpuInfo::AVX },
+  { "avx2",   CpuInfo::AVX2 },
 };
 static const long num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]);
 
@@ -57,117 +102,269 @@ static const long num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]);
 // values contains a list of space-seperated flags.  check to see if the flags we
 // care about are present.
 // Returns a bitmap of flags.
-int64_t parse_cpu_flags(const std::string& values) {
-    int64_t flags = 0;
-
-    for (int i = 0; i < num_flags; ++i) {
-        if (boost::contains(values, flag_mappings[i].name)) {
-            flags |= flag_mappings[i].flag;
-        }
+int64_t ParseCPUFlags(const string& values) {
+  int64_t flags = 0;
+  for (int i = 0; i < num_flags; ++i) {
+    if (contains(values, flag_mappings[i].name)) {
+      flags |= flag_mappings[i].flag;
     }
-
-    return flags;
+  }
+  return flags;
 }
 
 void CpuInfo::init() {
-    std::string line;
-    std::string name;
-    std::string value;
+  string line;
+  string name;
+  string value;
 
-    float max_mhz = 0;
-    int num_cores = 0;
+  float max_mhz = 0;
+  int num_cores = 0;
 
-    memset(&_s_cache_sizes, 0, sizeof(_s_cache_sizes));
-
-    // Read from /proc/cpuinfo
-    std::ifstream cpuinfo("/proc/cpuinfo", std::ios::in);
-
-    while (cpuinfo) {
-        getline(cpuinfo, line);
-        size_t colon = line.find(':');
-
-        if (colon != std::string::npos) {
-            name = line.substr(0, colon - 1);
-            value = line.substr(colon + 1, std::string::npos);
-            boost::trim(name);
-            boost::trim(value);
-
-            if (name.compare("flags") == 0) {
-                _s_hardware_flags |= parse_cpu_flags(value);
-            } else if (name.compare("cpu MHz") == 0) {
-                // Every core will report a different speed.  We'll take the max, assuming
-                // that when impala is running, the core will not be in a lower power state.
-                // TODO: is there a more robust way to do this, such as
-                // Window's QueryPerformanceFrequency()
-                float mhz = atof(value.c_str());
-                max_mhz = std::max(mhz, max_mhz);
-            } else if (name.compare("processor") == 0) {
-                ++num_cores;
-            } else if (name.compare("model name") == 0) {
-                _s_model_name = value;
-            }
-        }
+  // Read from /proc/cpuinfo
+  ifstream cpuinfo("/proc/cpuinfo");
+  while (cpuinfo) {
+    getline(cpuinfo, line);
+    size_t colon = line.find(':');
+    if (colon != string::npos) {
+      name = line.substr(0, colon - 1);
+      value = line.substr(colon + 1, string::npos);
+      trim(name);
+      trim(value);
+      if (name.compare("flags") == 0) {
+        hardware_flags_ |= ParseCPUFlags(value);
+      } else if (name.compare("cpu MHz") == 0) {
+        // Every core will report a different speed.  We'll take the max, assuming
+        // that when impala is running, the core will not be in a lower power state.
+        // TODO: is there a more robust way to do this, such as
+        // Window's QueryPerformanceFrequency()
+        float mhz = atof(value.c_str());
+        max_mhz = max(mhz, max_mhz);
+      } else if (name.compare("processor") == 0) {
+        ++num_cores;
+      } else if (name.compare("model name") == 0) {
+        model_name_ = value;
+      }
     }
+  }
 
-    if (cpuinfo.is_open()) {
-        cpuinfo.close();
+  if (max_mhz != 0) {
+    cycles_per_ms_ = max_mhz * 1000;
+  } else {
+    cycles_per_ms_ = 1000000;
+  }
+  original_hardware_flags_ = hardware_flags_;
+
+  if (num_cores > 0) {
+    num_cores_ = num_cores;
+  } else {
+    num_cores_ = 1;
+  }
+  if (config::flags_num_cores > 0) num_cores_ = config::flags_num_cores;
+  max_num_cores_ = get_nprocs_conf();
+
+  // Print a warning if something is wrong with sched_getcpu().
+#ifdef HAVE_SCHED_GETCPU
+  if (sched_getcpu() == -1) {
+    LOG(WARNING) << "Kernel does not support getcpu(). Performance may be impacted.";
+  }
+#else
+  LOG(WARNING) << "Built on a system without sched_getcpu() support. Performance may"
+               << " be impacted.";
+#endif
+
+  _init_numa();
+  initialized_ = true;
+}
+
+void CpuInfo::_init_numa() {
+  // Use the NUMA info in the /sys filesystem. which is part of the Linux ABI:
+  // see https://www.kernel.org/doc/Documentation/ABI/stable/sysfs-devices-node and
+  // https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-devices-system-cpu
+  // The filesystem entries are only present if the kernel was compiled with NUMA support.
+  core_to_numa_node_.reset(new int[max_num_cores_]);
+
+  if (!fs::is_directory("/sys/devices/system/node")) {
+    LOG(WARNING) << "/sys/devices/system/node is not present - no NUMA support";
+    // Assume a single NUMA node.
+    max_num_numa_nodes_ = 1;
+    std::fill_n(core_to_numa_node_.get(), max_num_cores_, 0);
+    _init_numa_node_to_cores();
+    return;
+  }
+
+  // Search for node subdirectories - node0, node1, node2, etc to determine possible
+  // NUMA nodes.
+  fs::directory_iterator dir_it("/sys/devices/system/node");
+  max_num_numa_nodes_ = 0;
+  for (; dir_it != fs::directory_iterator(); ++dir_it) {
+    const string filename = dir_it->path().filename().string();
+    if (filename.find("node") == 0) ++max_num_numa_nodes_;
+  }
+  if (max_num_numa_nodes_ == 0) {
+    LOG(WARNING) << "Could not find nodes in /sys/devices/system/node";
+    max_num_numa_nodes_ = 1;
+  }
+
+  // Check which NUMA node each core belongs to based on the existence of a symlink
+  // to the node subdirectory.
+  for (int core = 0; core < max_num_cores_; ++core) {
+    bool found_numa_node = false;
+    for (int node = 0; node < max_num_numa_nodes_; ++node) {
+      if (fs::exists(Substitute("/sys/devices/system/cpu/cpu$0/node$1", core, node))) {
+        core_to_numa_node_[core] = node;
+        found_numa_node = true;
+        break;
+      }
     }
-
-    // Call sysconf to query for the cache sizes
-    _s_cache_sizes[0] = sysconf(_SC_LEVEL1_DCACHE_SIZE);
-    _s_cache_sizes[1] = sysconf(_SC_LEVEL2_CACHE_SIZE);
-    _s_cache_sizes[2] = sysconf(_SC_LEVEL3_CACHE_SIZE);
-
-    if (max_mhz > 0.0) {
-        _s_cycles_per_ms = max_mhz * 1000;
-    } else {
-        _s_cycles_per_ms = 1000000;
+    if (!found_numa_node) {
+      LOG(WARNING) << "Could not determine NUMA node for core " << core
+                   << " from /sys/devices/system/cpu/";
+      core_to_numa_node_[core] = 0;
     }
+  }
+  _init_numa_node_to_cores();
+}
 
-    _s_original_hardware_flags = _s_hardware_flags;
+void CpuInfo::_init_fake_numa_for_test(
+    int max_num_numa_nodes, const vector<int>& core_to_numa_node) {
+  DCHECK_EQ(max_num_cores_, core_to_numa_node.size());
+  max_num_numa_nodes_ = max_num_numa_nodes;
+  for (int i = 0; i < max_num_cores_; ++i) {
+    core_to_numa_node_[i] = core_to_numa_node[i];
+  }
+  numa_node_to_cores_.clear();
+  _init_numa_node_to_cores();
+}
 
-    if (num_cores > 0) {
-        _s_num_cores = num_cores;
-    } else {
-        _s_num_cores = 1;
-    }
+void CpuInfo::_init_numa_node_to_cores() {
+  DCHECK(numa_node_to_cores_.empty());
+  numa_node_to_cores_.resize(max_num_numa_nodes_);
+  numa_node_core_idx_.resize(max_num_cores_);
+  for (int core = 0; core < max_num_cores_; ++core) {
+    vector<int>* cores_of_node = &numa_node_to_cores_[core_to_numa_node_[core]];
+    numa_node_core_idx_[core] = cores_of_node->size();
+    cores_of_node->push_back(core);
+  }
+}
 
-    _s_initialized = true;
+void CpuInfo::verify_cpu_requirements() {
+  if (!CpuInfo::is_supported(CpuInfo::SSSE3)) {
+    LOG(ERROR) << "CPU does not support the Supplemental SSE3 (SSSE3) instruction set. "
+               << "This setup is generally unsupported and Impala might be unstable.";
+  }
+}
+
+void CpuInfo::verify_performance_governor() {
+  for (int cpu_id = 0; cpu_id < CpuInfo::num_cores(); ++cpu_id) {
+    const string governor_file =
+        Substitute("/sys/devices/system/cpu/cpu$0/cpufreq/scaling_governor", cpu_id);
+    const string warning_text = Substitute(
+        "WARNING: CPU $0 is not using 'performance' governor. Note that changing the "
+        "governor to 'performance' will reset the no_turbo setting to 0.",
+        cpu_id);
+    WarnIfFileNotEqual(governor_file, "performance", warning_text);
+  }
+}
+
+void CpuInfo::verify_turbo_disabled() {
+  WarnIfFileNotEqual("/sys/devices/system/cpu/intel_pstate/no_turbo", "1",
+      "WARNING: CPU turbo is enabled. This setting can change the clock frequency of CPU "
+      "cores during the benchmark run, which can lead to inaccurate results. You can "
+      "disable CPU turbo by writing a 1 to "
+      "/sys/devices/system/cpu/intel_pstate/no_turbo. Note that changing the governor to "
+      "'performance' will reset this to 0.");
 }
 
 void CpuInfo::enable_feature(long flag, bool enable) {
-    DCHECK(_s_initialized);
+  DCHECK(initialized_);
+  if (!enable) {
+    hardware_flags_ &= ~flag;
+  } else {
+    // Can't turn something on that can't be supported
+    DCHECK((original_hardware_flags_ & flag) != 0);
+    hardware_flags_ |= flag;
+  }
+}
 
-    if (!enable) {
-        _s_hardware_flags &= ~flag;
-    } else {
-        // Can't turn something on that can't be supported
-        DCHECK((_s_original_hardware_flags & flag) != 0);
-        _s_hardware_flags |= flag;
+int CpuInfo::get_current_core() {
+  // sched_getcpu() is not supported on some old kernels/glibcs (like the versions that
+  // shipped with CentOS 5). In that case just pretend we're always running on CPU 0
+  // so that we can build and run with degraded perf.
+#ifdef HAVE_SCHED_GETCPU
+  int cpu = sched_getcpu();
+  // The syscall may not be supported even if the function exists.
+  return cpu == -1 ? 0 : cpu;
+#else
+  return 0;
+#endif
+}
+
+void CpuInfo::_get_cache_info(long cache_sizes[NUM_CACHE_LEVELS],
+      long cache_line_sizes[NUM_CACHE_LEVELS]) {
+#ifdef __APPLE__
+  // On Mac OS X use sysctl() to get the cache sizes
+  size_t len = 0;
+  sysctlbyname("hw.cachesize", NULL, &len, NULL, 0);
+  uint64_t* data = static_cast<uint64_t*>(malloc(len));
+  sysctlbyname("hw.cachesize", data, &len, NULL, 0);
+  DCHECK(len / sizeof(uint64_t) >= 3);
+  for (size_t i = 0; i < NUM_CACHE_LEVELS; ++i) {
+    cache_sizes[i] = data[i];
+  }
+  size_t linesize;
+  size_t sizeof_linesize = sizeof(linesize);
+  sysctlbyname("hw.cachelinesize", &linesize, &sizeof_linesize, NULL, 0);
+  for (size_t i = 0; i < NUM_CACHE_LEVELS; ++i) cache_line_sizes[i] = linesize;
+#else
+  // Call sysconf to query for the cache sizes
+  // Note: on some systems (e.g. RHEL 5 on AWS EC2), this returns 0 instead of the
+  // actual cache line size.
+  cache_sizes[L1_CACHE] = sysconf(_SC_LEVEL1_DCACHE_SIZE);
+  cache_sizes[L2_CACHE] = sysconf(_SC_LEVEL2_CACHE_SIZE);
+  cache_sizes[L3_CACHE] = sysconf(_SC_LEVEL3_CACHE_SIZE);
+
+  cache_line_sizes[L1_CACHE] = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
+  cache_line_sizes[L2_CACHE] = sysconf(_SC_LEVEL2_CACHE_LINESIZE);
+  cache_line_sizes[L3_CACHE] = sysconf(_SC_LEVEL3_CACHE_LINESIZE);
+#endif
+}
+
+string CpuInfo::debug_string() {
+  DCHECK(initialized_);
+  stringstream stream;
+  long cache_sizes[NUM_CACHE_LEVELS];
+  long cache_line_sizes[NUM_CACHE_LEVELS];
+  _get_cache_info(cache_sizes, cache_line_sizes);
+
+  string L1 = Substitute("L1 Cache: $0 (Line: $1)",
+      PrettyPrinter::print(cache_sizes[L1_CACHE], TUnit::BYTES),
+      PrettyPrinter::print(cache_line_sizes[L1_CACHE], TUnit::BYTES));
+  string L2 = Substitute("L2 Cache: $0 (Line: $1)",
+      PrettyPrinter::print(cache_sizes[L2_CACHE], TUnit::BYTES),
+      PrettyPrinter::print(cache_line_sizes[L2_CACHE], TUnit::BYTES));
+  string L3 = Substitute("L3 Cache: $0 (Line: $1)",
+      PrettyPrinter::print(cache_sizes[L3_CACHE], TUnit::BYTES),
+      PrettyPrinter::print(cache_line_sizes[L3_CACHE], TUnit::BYTES));
+  stream << "Cpu Info:" << endl
+         << "  Model: " << model_name_ << endl
+         << "  Cores: " << num_cores_ << endl
+         << "  Max Possible Cores: " << max_num_cores_ << endl
+         << "  " << L1 << endl
+         << "  " << L2 << endl
+         << "  " << L3 << endl
+         << "  Hardware Supports:" << endl;
+  for (int i = 0; i < num_flags; ++i) {
+    if (is_supported(flag_mappings[i].flag)) {
+      stream << "    " << flag_mappings[i].name << endl;
     }
-}
-
-std::string CpuInfo::debug_string() {
-    DCHECK(_s_initialized);
-    std::stringstream stream;
-    int64_t l1 = cache_size(L1_CACHE);
-    int64_t l2 = cache_size(L2_CACHE);
-    int64_t l3 = cache_size(L3_CACHE);
-    stream << "Cpu Info:" << std::endl
-           << "  Model: " << _s_model_name << std::endl
-           << "  Cores: " << _s_num_cores << std::endl
-           << "  L1 Cache: " << PrettyPrinter::print(l1, TUnit::BYTES) << std::endl
-           << "  L2 Cache: " << PrettyPrinter::print(l2, TUnit::BYTES) << std::endl
-           << "  L3 Cache: " << PrettyPrinter::print(l3, TUnit::BYTES) << std::endl
-           << "  Hardware Supports:" << std::endl;
-
-    for (int i = 0; i < num_flags; ++i) {
-        if (is_supported(flag_mappings[i].flag)) {
-            stream << "    " << flag_mappings[i].name << std::endl;
-        }
-    }
-
-    return stream.str();
+  }
+  stream << "  Numa Nodes: " << max_num_numa_nodes_ << endl;
+  stream << "  Numa Nodes of Cores:";
+  for (int core = 0; core < max_num_cores_; ++core) {
+    stream << " " << core << "->" << core_to_numa_node_[core] << " |";
+  }
+  stream << endl;
+  return stream.str();
 }
 
 }
diff --git a/be/src/util/cpu_info.h b/be/src/util/cpu_info.h
old mode 100644
new mode 100755
index f148fa5c20..9ab8f3303a
--- a/be/src/util/cpu_info.h
+++ b/be/src/util/cpu_info.h
@@ -18,88 +18,211 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#ifndef BDG_PALO_BE_SRC_COMMON_UTIL_CPU_INFO_H
-#define BDG_PALO_BE_SRC_COMMON_UTIL_CPU_INFO_H
 
+#ifndef BDG_PALO_BE_SRC_UTIL_CPU_INFO_H
+#define BDG_PALO_BE_SRC_UTIL_CPU_INFO_H
+
+#include <memory>
 #include 
+#include <vector>
 #include 
 
 #include "common/logging.h"
 
 namespace palo {
 
-// CpuInfo is an interface to query for cpu information at runtime.  The caller can
-// ask for the sizes of the caches and what hardware features are supported.
-// On Linux, this information is pulled from a couple of sys files (/proc/cpuinfo and
-// /sys/devices)
+/// CpuInfo is an interface to query for cpu information at runtime.  The caller can
+/// ask for the sizes of the caches and what hardware features are supported.
+/// On Linux, this information is pulled from a couple of sys files (/proc/cpuinfo and
+/// /sys/devices)
 class CpuInfo {
-public:
-    static const int64_t SSE3    = (1 << 1);
-    static const int64_t SSE4_1  = (1 << 2);
-    static const int64_t SSE4_2  = (1 << 3);
-    static const int64_t POPCNT  = (1 << 4);
+ public:
+  static const int64_t SSSE3   = (1 << 1);
+  static const int64_t SSE4_1  = (1 << 2);
+  static const int64_t SSE4_2  = (1 << 3);
+  static const int64_t POPCNT  = (1 << 4);
+  static const int64_t AVX     = (1 << 5);
+  static const int64_t AVX2    = (1 << 6);
 
-    // Cache enums for L1 (data), L2 and L3
-    enum CacheLevel {
-        L1_CACHE = 0,
-        L2_CACHE = 1,
-        L3_CACHE = 2,
-    };
+  /// Cache enums for L1 (data), L2 and L3
+  enum CacheLevel {
+    L1_CACHE = 0,
+    L2_CACHE = 1,
+    L3_CACHE = 2,
+  };
+  static const int NUM_CACHE_LEVELS = L3_CACHE + 1;
 
-    // Initialize CpuInfo.
-    static void init();
+  /// Initialize CpuInfo.
+  static void init();
 
-    // Returns all the flags for this cpu
-    static int64_t hardware_flags() {
-        DCHECK(_s_initialized);
-        return _s_hardware_flags;
+  /// Determine if the CPU meets the minimum CPU requirements and if not, log an error.
+  static void verify_cpu_requirements();
+
+  /// Determine if the CPU scaling governor is set to 'performance' and if not, issue an
+  /// error.
+  static void verify_performance_governor();
+
+  /// Determine if CPU turbo is disabled and if not, issue an error.
+  static void verify_turbo_disabled();
+
+  /// Returns all the flags for this cpu
+  static int64_t hardware_flags() {
+    DCHECK(initialized_);
+    return hardware_flags_;
+  }
+
+  /// Returns whether of not the cpu supports this flag
+  inline static bool is_supported(long flag) {
+    DCHECK(initialized_);
+    return (hardware_flags_ & flag) != 0;
+  }
+
+  /// Toggle a hardware feature on and off.  It is not valid to turn on a feature
+  /// that the underlying hardware cannot support. This is useful for testing.
+  static void enable_feature(long flag, bool enable);
+
+  /// Returns the number of cpu cycles per millisecond
+  static int64_t cycles_per_ms() {
+    DCHECK(initialized_);
+    return cycles_per_ms_;
+  }
+
+  /// Returns the number of cores (including hyper-threaded) on this machine that are
+  /// available for use by Impala (either the number of online cores or the value of
+  /// the --num_cores command-line flag).
+  static int num_cores() {
+    DCHECK(initialized_);
+    return num_cores_;
+  }
+
+  /// Returns the maximum number of cores that will be online in the system, including
+  /// any offline cores or cores that could be added via hot-plugging.
+  static int get_max_num_cores() { return max_num_cores_; }
+
+  /// Returns the core that the current thread is running on. Always in range
+  /// [0, GetMaxNumCores()). Note that the thread may be migrated to a different core
+  /// at any time by the scheduler, so the caller should not assume the answer will
+  /// remain stable.
+  static int get_current_core();
+
+  /// Returns the maximum number of NUMA nodes that will be online in the system,
+  /// including any that may be offline or disabled.
+  static int get_max_num_numa_nodes() { return max_num_numa_nodes_; }
+
+  /// Returns the NUMA node of the core provided. 'core' must be in the range
+  /// [0, GetMaxNumCores()).
+  static int get_numa_node_of_core(int core) {
+    DCHECK_LE(0, core);
+    DCHECK_LT(core, max_num_cores_);
+    return core_to_numa_node_[core];
+  }
+
+  /// Returns the cores in a NUMA node. 'node' must be in the range
+  /// [0, GetMaxNumNumaNodes()).
+  static const std::vector<int>& get_cores_of_numa_node(int node) {
+    DCHECK_LE(0, node);
+    DCHECK_LT(node, max_num_numa_nodes_);
+    return numa_node_to_cores_[node];
+  }
+
+  /// Returns the cores in the same NUMA node as 'core'. 'core' must be in the range
+  /// [0, GetMaxNumCores()).
+  static const std::vector<int>& get_cores_of_same_numa_node(int core) {
+    DCHECK_LE(0, core);
+    DCHECK_LT(core, max_num_cores_);
+    return get_cores_of_numa_node(get_numa_node_of_core(core));
+  }
+
+  /// Returns the index of the given core within the vector returned by
+  /// GetCoresOfNumaNode() and GetCoresOfSameNumaNode(). 'core' must be in the range
+  /// [0, GetMaxNumCores()).
+  static int get_numa_node_core_idx(int core) {
+    DCHECK_LE(0, core);
+    DCHECK_LT(core, max_num_cores_);
+    return numa_node_core_idx_[core];
+  }
+
+  /// Returns the model name of the cpu (e.g. Intel i7-2600)
+  static std::string model_name() {
+    DCHECK(initialized_);
+    return model_name_;
+  }
+
+  static std::string debug_string();
+
+  /// A utility class for temporarily disabling CPU features. Usage:
+  ///
+  /// {
+  ///   CpuInfo::TempDisable disabler(CpuInfo::AVX2);
+  ///   // On the previous line, the constructor disables AVX2 instructions. On the next
+  ///   // line, CpuInfo::IsSupported(CpuInfo::AVX2) will return false.
+  ///   SomeOperation();
+  ///   // On the next line, the block closes, 'disabler's destructor runs, and AVX2
+  ///   // instructions are re-enabled.
+  /// }
+  ///
+  /// TempDisable's destructor never re-enables features that were not enabled when then
+  /// constructor ran.
+  struct TempDisable {
+    TempDisable(int64_t feature)
+      : feature_(feature), reenable_(CpuInfo::is_supported(feature)) {
+      CpuInfo::enable_feature(feature_, false);
+    }
+    ~TempDisable() {
+      if (reenable_) {
+        CpuInfo::enable_feature(feature_, true);
+      }
     }
 
-    // Returns whether of not the cpu supports this flag
-    inline static bool is_supported(long flag) {
-        DCHECK(_s_initialized);
-        return (_s_hardware_flags & flag) != 0;
-    }
+   private:
+    int64_t feature_;
+    bool reenable_;
+  };
 
-    // Toggle a hardware feature on and off.  It is not valid to turn on a feature
-    // that the underlying hardware cannot support. This is useful for testing.
-    static void enable_feature(long flag, bool enable);
+ protected:
+  friend class CpuTestUtil;
 
-    // Returns the size of the cache in KB at this cache level
-    static long cache_size(CacheLevel level) {
-        DCHECK(_s_initialized);
-        return _s_cache_sizes[level];
-    }
+  /// Setup fake NUMA info to simulate NUMA for backend tests. Sets up CpuInfo to
+  /// simulate 'max_num_numa_nodes' with 'core_to_numa_node' specifying the NUMA node
+  /// of each core in [0, GetMaxNumCores()).
+  static void _init_fake_numa_for_test(
+      int max_num_numa_nodes, const std::vector<int>& core_to_numa_node);
 
-    // Returns the number of cpu cycles per millisecond
-    static int64_t cycles_per_ms() {
-        DCHECK(_s_initialized);
-        return _s_cycles_per_ms;
-    }
+ private:
+  /// Initialize NUMA-related state - called from Init();
+  static void _init_numa();
 
-    // Returns the number of cores (including hyper-threaded) on this machine.
-    static int num_cores() {
-        DCHECK(_s_initialized);
-        return _s_num_cores;
-    }
+  /// Initialize 'numa_node_to_cores_' based on 'max_num_numa_nodes_' and
+  /// 'core_to_numa_node_'. Called from InitNuma();
+  static void _init_numa_node_to_cores();
 
-    // Returns the model name of the cpu (e.g. Intel i7-2600)
-    static std::string model_name() {
-        DCHECK(_s_initialized);
-        return _s_model_name;
-    }
+  /// Populates the arguments with information about this machine's caches.
+  /// The values returned are not reliable in some environments, e.g. RHEL5 on EC2, so
+  /// so we will keep this as a private method.
+  static void _get_cache_info(long cache_sizes[NUM_CACHE_LEVELS],
+      long cache_line_sizes[NUM_CACHE_LEVELS]);
 
-    static std::string debug_string();
+  static bool initialized_;
+  static int64_t hardware_flags_;
+  static int64_t original_hardware_flags_;
+  static int64_t cycles_per_ms_;
+  static int num_cores_;
+  static int max_num_cores_;
+  static std::string model_name_;
 
-private:
-    static bool _s_initialized;
-    static int64_t _s_hardware_flags;
-    static int64_t _s_original_hardware_flags;
-    static long _s_cache_sizes[L3_CACHE + 1];
-    static int64_t _s_cycles_per_ms;
-    static int _s_num_cores;
-    static std::string _s_model_name;
+  /// Maximum possible number of NUMA nodes.
+  static int max_num_numa_nodes_;
+
+  /// Array with 'max_num_cores_' entries, each of which is the NUMA node of that core.
+  static std::unique_ptr<int[]> core_to_numa_node_;
+
+  /// Vector with 'max_num_numa_nodes_' entries, each of which is a vector of the cores
+  /// belonging to that NUMA node.
+  static std::vector<std::vector<int>> numa_node_to_cores_;
+
+  /// Array with 'max_num_cores_' entries, each of which is the index of that core in its
+  /// NUMA node.
+  static std::vector<int> numa_node_core_idx_;
 };
-
 }
 #endif
diff --git a/be/src/util/date_func.h b/be/src/util/date_func.h
new file mode 100644
index 0000000000..3f6f987888
--- /dev/null
+++ b/be/src/util/date_func.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_SRC_UTIL_DATE_FUNC_H
+#define BDG_PALO_BE_SRC_UTIL_DATE_FUNC_H
+
+#include <stdint.h>
+#include <string>
+#include <time.h>
+
+#include "olap/field.h"
+
+namespace palo {
+
+static uint64_t timestamp_from_datetime(const std::string& datetime_str) {
+    tm time_tm;
+    char* res = strptime(datetime_str.c_str(), "%Y-%m-%d %H:%M:%S", &time_tm);
+
+    uint64_t value = 0;
+    if (NULL != res) {
+        value = ((time_tm.tm_year + 1900) * 10000L
+                + (time_tm.tm_mon + 1) * 100L
+                + time_tm.tm_mday) * 1000000L
+            + time_tm.tm_hour * 10000L
+            + time_tm.tm_min * 100L
+            + time_tm.tm_sec;
+    } else {
+        // 1400 - 01 - 01
+        value = 14000101000000;
+    }
+
+    return value;
+}
+
+static uint24_t timestamp_from_date(const std::string& date_str) {
+    tm time_tm;
+    char* res = strptime(date_str.c_str(), "%Y-%m-%d", &time_tm);
+
+    int value = 0;
+    if (NULL != res) {
+        value = (time_tm.tm_year + 1900) * 16 * 32
+            + (time_tm.tm_mon + 1) * 32
+            + time_tm.tm_mday;
+    } else {
+        // 1400 - 01 - 01
+        value = 716833;
+    }
+
+    return uint24_t(value);
+}
+
+}  // namespace palo
+
+#endif // BDG_PALO_BE_SRC_UTIL_DATE_FUNC_H
diff --git a/be/src/util/debug_util.cpp b/be/src/util/debug_util.cpp
index d47afb403d..480b0f1a89 100644
--- a/be/src/util/debug_util.cpp
+++ b/be/src/util/debug_util.cpp
@@ -33,6 +33,7 @@
 #include "runtime/row_batch.h"
 #include "util/cpu_info.h"
 #include "gen_cpp/Opcodes_types.h"
+#include "gen_cpp/types.pb.h"
 
 #define PRECISION 2
 #define KILOBYTE (1024)
@@ -96,6 +97,12 @@ std::string print_id(const TUniqueId& id) {
     return out.str();
 }
 
+std::string print_id(const PUniqueId& id) {
+    std::stringstream out;
+    out << std::hex << id.hi() << ":" << id.lo();
+    return out.str();
+}
+
 bool parse_id(const std::string& s, TUniqueId* id) {
     DCHECK(id != NULL);
 
@@ -224,4 +231,13 @@ std::string get_stack_trace() {
     return s;
 }
 
+std::string hexdump(const char* buf, int len) {
+    std::stringstream ss;
+    ss << std::hex << std::uppercase;
+    for (int i = 0; i < len; ++i) {
+        ss << std::setfill('0') << std::setw(2) << ((uint16_t)buf[i] & 0xff);
+    }
+    return ss.str();
+}
+
 }
diff --git a/be/src/util/debug_util.h b/be/src/util/debug_util.h
index cc60991183..4a1f9f4fae 100644
--- a/be/src/util/debug_util.h
+++ b/be/src/util/debug_util.h
@@ -40,11 +40,13 @@ class TupleDescriptor;
 class Tuple;
 class TupleRow;
 class RowBatch;
+class PUniqueId;
 
 std::string print_tuple(const Tuple* t, const TupleDescriptor& d);
 std::string print_row(TupleRow* row, const RowDescriptor& d);
 std::string print_batch(RowBatch* batch);
 std::string print_id(const TUniqueId& id);
+std::string print_id(const PUniqueId& id);
 std::string print_plan_node_type(const TPlanNodeType::type& type);
 std::string print_tstmt_type(const TStmtType::type& type);
 std::string print_query_state(const QueryState::type& type);
@@ -69,6 +71,8 @@ std::string get_version_string(bool compact);
 // for recursive calls.
 std::string get_stack_trace();
 
+std::string hexdump(const char* buf, int len);
+
 }
 
 #endif
diff --git a/be/src/util/disk_info.cpp b/be/src/util/disk_info.cpp
index e5b365b1f5..0d8c6c731f 100644
--- a/be/src/util/disk_info.cpp
+++ b/be/src/util/disk_info.cpp
@@ -159,4 +159,63 @@ std::string DiskInfo::debug_string() {
     return stream.str();
 }
 
+Status DiskInfo::get_disk_devices(const std::vector<std::string>& paths,
+                                  std::set<std::string>* devices) {
+    FILE* fp = fopen("/proc/mounts", "r");
+    if (fp == nullptr) {
+        std::stringstream ss;
+        char buf[64];
+        ss << "open /proc/mounts failed, errno:" << errno
+            << ", message:" << strerror_r(errno, buf, 64);
+        LOG(WARNING) << ss.str();
+        return Status(ss.str());
+    }
+
+    Status status;
+    char* line_ptr = 0;
+    size_t line_buf_size = 0;
+    for (auto& path : paths) {
+        size_t max_mount_size = 0;
+        std::string match_dev;
+        rewind(fp);
+        while (getline(&line_ptr, &line_buf_size, fp) > 0)  {
+            char dev_path[4096];
+            char mount_path[4096];
+            int num = sscanf(line_ptr, "%4095s %4095s", dev_path, mount_path);
+            if (num < 2) {
+                continue;
+            }
+            size_t mount_size = strlen(mount_path);
+            if (mount_size < max_mount_size ||
+                path.size() < mount_size ||
+                strncmp(path.c_str(), mount_path, mount_size) != 0) {
+                continue;
+            }
+            std::string dev(basename(dev_path));
+            boost::trim_right_if(dev, boost::is_any_of("0123456789"));
+            if (_s_disk_name_to_disk_id.find(dev) != std::end(_s_disk_name_to_disk_id)) {
+                max_mount_size = mount_size;
+                match_dev = dev;
+            }
+        }
+        if (ferror(fp) != 0) {
+            std::stringstream ss;
+            char buf[64];
+            ss << "open /proc/mounts failed, errno:" << errno
+                << ", message:" << strerror_r(errno, buf, 64);
+            LOG(WARNING) << ss.str();
+            status = Status(ss.str());
+            break;
+        }
+        if (max_mount_size > 0) {
+            devices->emplace(match_dev);
+        }
+    }
+    if (line_ptr != nullptr) {
+        free(line_ptr);
+    }
+    fclose(fp);
+    return status;
+}
+
 }
diff --git a/be/src/util/disk_info.h b/be/src/util/disk_info.h
index c505d7dcf6..3f360b1e8e 100644
--- a/be/src/util/disk_info.h
+++ b/be/src/util/disk_info.h
@@ -23,9 +23,11 @@
 
 #include 
 #include 
+#include <set>
 
 #include 
 #include "common/logging.h"
+#include "common/status.h"
 
 namespace palo {
 
@@ -80,6 +82,10 @@ public:
 
     static std::string debug_string();
 
+    // get disk devices of given path
+    static Status get_disk_devices(const std::vector<std::string>& paths,
+                                   std::set<std::string>* devices);
+
 private:
     static bool _s_initialized;
 
diff --git a/be/src/util/dummy_runtime_profile.h b/be/src/util/dummy_runtime_profile.h
new file mode 100755
index 0000000000..b08650b804
--- /dev/null
+++ b/be/src/util/dummy_runtime_profile.h
@@ -0,0 +1,41 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_SRC_UTIL_DEBUG_RUNTIME_PROFILE_H
+#define BDG_PALO_BE_SRC_UTIL_DEBUG_RUNTIME_PROFILE_H
+
+#include "common/object_pool.h"
+#include "util/runtime_profile.h"
+
+namespace palo {
+class DummyProfile {
+public:
+    DummyProfile() : _pool(), _profile(new RuntimeProfile(&_pool, "dummy", false)) {}
+    RuntimeProfile* profile() { return _profile; }
+    virtual ~DummyProfile() {
+        delete _profile;
+    }
+
+private:
+    ObjectPool _pool;
+    RuntimeProfile* const _profile;
+};
+}
+#endif
diff --git a/be/src/util/fake_lock.h b/be/src/util/fake_lock.h
new file mode 100644
index 0000000000..a15fb69b78
--- /dev/null
+++ b/be/src/util/fake_lock.h
@@ -0,0 +1,43 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_SRC_UTIL_FAKE_LOCK_H
+#define BDG_PALO_BE_SRC_UTIL_FAKE_LOCK_H
+
+#include "gutil/macros.h"
+
+namespace palo {
+
+// Implementation of Boost's lockable interface that does nothing. Used to replace an
+// actual lock implementation in template classes in if no thread safety is needed.
+class FakeLock {
+public:
+    FakeLock() {}
+    void lock() {}
+    void unlock() {}
+    bool try_lock() { return true; }
+
+private:
+    DISALLOW_COPY_AND_ASSIGN(FakeLock);
+};
+
+} // namespace palo
+
+#endif // BDG_PALO_BE_SRC_UTIL_FAKE_LOCK_H
diff --git a/be/src/util/hash_util.hpp b/be/src/util/hash_util.hpp
index 26cd1e486b..6da0ce5377 100644
--- a/be/src/util/hash_util.hpp
+++ b/be/src/util/hash_util.hpp
@@ -299,12 +299,15 @@ struct hash {
     }
 };
 
+#if !defined(IR_COMPILE) && __GNUC__ < 6
+// Cause this is builtin function
 template<>
 struct hash<__int128> {
     std::size_t operator()(const __int128& val) const {
         return palo::HashUtil::hash(&val, sizeof(val), 0);
     }
 };
+#endif
 
 }
 
diff --git a/be/src/util/internal_queue.h b/be/src/util/internal_queue.h
index 393d374015..6bd44145f3 100644
--- a/be/src/util/internal_queue.h
+++ b/be/src/util/internal_queue.h
@@ -18,269 +18,270 @@
 // specific language governing permissions and limitations
 // under the License.
 
+
 #ifndef BDG_PALO_BE_SRC_UTIL_INTERNAL_QUEUE_H
 #define BDG_PALO_BE_SRC_UTIL_INTERNAL_QUEUE_H
 
+#include <boost/function.hpp>
 #include 
 
-#include "common/atomic.h"
+#include "util/fake_lock.h"
 #include "util/spinlock.h"
 
 namespace palo {
 
-// Thread safe fifo-queue. This is an internal queue, meaning the links to nodes
-// are maintained in the object itself. This is in contrast to the stl list which
-// allocates a wrapper Node object around the data. Since it's an internal queue,
-// the list pointers are maintained in the Nodes which is memory owned by the user.
-// The nodes cannot be deallocated while the queue has elements.
-// To use: subclass InternalQueue::Node.
-// The internal structure is a doubly-linked list.
-//  NULL <-- N1 <--> N2 <--> N3 --> NULL
-//          (head)          (tail)
-// TODO: this is an ideal candidate to be made lock free.
+/// FIFO queue implemented as a doubly-linked lists with internal pointers. This is in
+/// contrast to the STL list which allocates a wrapper Node object around the data. Since
+/// it's an internal queue, the list pointers are maintained in the Nodes which is memory
+/// owned by the user. The nodes cannot be deallocated while the queue has elements.
+/// The internal structure is a doubly-linked list.
+///  NULL <-- N1 <--> N2 <--> N3 --> NULL
+///          (head)          (tail)
+///
+/// InternalQueue instantiates a thread-safe queue where the queue is protected by an
+/// internal Spinlock. InternalList instantiates a list with no thread safety.
+///
+/// To use these data structures, the element to be added to the queue or list must
+/// subclass ::Node.
+///
+/// TODO: this is an ideal candidate to be made lock free.
 
-// T must be a subclass of InternalQueue::Node
-template<typename T>
-class InternalQueue {
-public:
-    class Node {
-    public:
-        Node() : _parent_queue(NULL), _next(NULL), _prev(NULL) {}
-        virtual ~Node() {}
+/// T must be a subclass of InternalQueueBase::Node.
+template <typename T, typename LockType>
+class InternalQueueBase {
+ public:
+  struct Node {
+   public:
+    Node() : parent_queue(NULL), next_node(NULL), prev_node(NULL) {}
+    virtual ~Node() {}
 
-        // Returns the Next/Prev node or NULL if this is the end/front.
-        T* next() const {
-            boost::lock_guard lock(_parent_queue->_lock);
-            return reinterpret_cast(_next);
-        }
-        T* prev() const {
-            boost::lock_guard lock(_parent_queue->_lock);
-            return reinterpret_cast(_prev);
-        }
+    /// Returns true if the node is in a queue.
+    bool in_queue() const { return parent_queue != NULL; }
 
-    private:
-        friend class InternalQueue;
-
-        // Pointer to the queue this Node is on. NULL if not on any queue.
-        InternalQueue* _parent_queue;
-        Node* _next;
-        Node* _prev;
-    };
-
-    InternalQueue() : _head(NULL), _tail(NULL), _size(0) {}
-
-    ~InternalQueue() {
-        // do nothing
+    /// Returns the Next/Prev node or NULL if this is the end/front.
+    T* next() const {
+      boost::lock_guard<LockType> lock(parent_queue->lock_);
+      return reinterpret_cast<T*>(next_node);
+    }
+    T* prev() const {
+      boost::lock_guard<LockType> lock(parent_queue->lock_);
+      return reinterpret_cast<T*>(prev_node);
     }
 
-    // Returns the element at the head of the list without dequeuing or NULL
-    // if the queue is empty. This is O(1).
-    T* head() const {
-        boost::lock_guard lock(_lock);
-        if (empty()) {
-            return NULL;
-        }
-        return reinterpret_cast(_head);
+   private:
+    friend class InternalQueueBase;
+
+    /// Pointer to the queue this Node is on. NULL if not on any queue.
+    InternalQueueBase* parent_queue;
+    Node* next_node;
+    Node* prev_node;
+  };
+
+  InternalQueueBase() : head_(NULL), tail_(NULL), size_(0) {}
+
+  /// Returns the element at the head of the list without dequeuing or NULL
+  /// if the queue is empty. This is O(1).
+  T* head() const {
+    boost::lock_guard<LockType> lock(lock_);
+    if (empty()) return NULL;
+    return reinterpret_cast<T*>(head_);
+  }
+
+  /// Returns the element at the end of the list without dequeuing or NULL
+  /// if the queue is empty. This is O(1).
+  T* tail() {
+    boost::lock_guard<LockType> lock(lock_);
+    if (empty()) return NULL;
+    return reinterpret_cast<T*>(tail_);
+  }
+
+  /// Enqueue node onto the queue's tail. This is O(1).
+  void enqueue(T* n) {
+    Node* node = (Node*)n;
+    DCHECK(node->next_node == NULL);
+    DCHECK(node->prev_node == NULL);
+    DCHECK(node->parent_queue == NULL);
+    node->parent_queue = this;
+    {
+      boost::lock_guard<LockType> lock(lock_);
+      if (tail_ != NULL) tail_->next_node = node;
+      node->prev_node = tail_;
+      tail_ = node;
+      if (head_ == NULL) head_ = node;
+      ++size_;
     }
+  }
 
-    // Returns the element at the end of the list without dequeuing or NULL
-    // if the queue is empty. This is O(1).
-    T* tail() {
-        boost::lock_guard lock(_lock);
-        if (empty()) {
-            return NULL;
-        }
-        return reinterpret_cast(_tail);
+  /// Dequeues an element from the queue's head. Returns NULL if the queue
+  /// is empty. This is O(1).
+  T* dequeue() {
+    Node* result = NULL;
+    {
+      boost::lock_guard<LockType> lock(lock_);
+      if (empty()) return NULL;
+      --size_;
+      result = head_;
+      head_ = head_->next_node;
+      if (head_ == NULL) {
+        tail_ = NULL;
+      } else {
+        head_->prev_node = NULL;
+      }
     }
+    DCHECK(result != NULL);
+    result->next_node = result->prev_node = NULL;
+    result->parent_queue = NULL;
+    return reinterpret_cast<T*>(result);
+  }
 
-    // Enqueue node onto the queue's tail. This is O(1).
-    void enqueue(T* n) {
-        Node* node = (Node*)n;
-        DCHECK(node->_next == NULL);
-        DCHECK(node->_prev == NULL);
-        DCHECK(node->_parent_queue == NULL);
-        node->_parent_queue = this;
-        {
-            boost::lock_guard lock(_lock);
-            if (_tail != NULL) {
-                _tail->_next = node;
-            }
-            node->_prev = _tail;
-            _tail = node;
-            if (_head == NULL) {
-                _head = node;
-            }
-            ++_size;
-        }
+  /// Dequeues an element from the queue's tail. Returns NULL if the queue
+  /// is empty. This is O(1).
+  T* pop_back() {
+    Node* result = NULL;
+    {
+      boost::lock_guard<LockType> lock(lock_);
+      if (empty()) return NULL;
+      --size_;
+      result = tail_;
+      tail_ = tail_->prev_node;
+      if (tail_ == NULL) {
+        head_ = NULL;
+      } else {
+        tail_->next_node = NULL;
+      }
     }
+    DCHECK(result != NULL);
+    result->next_node = result->prev_node = NULL;
+    result->parent_queue = NULL;
+    return reinterpret_cast<T*>(result);
+  }
 
-    // Dequeues an element from the queue's head. Returns NULL if the queue
-    // is empty. This is O(1).
-    T* dequeue() {
-        Node* result = NULL;
-        {
-            boost::lock_guard lock(_lock);
-            if (empty()) {
-                return NULL;
-            }
-            --_size;
-            result = _head;
-            _head = _head->_next;
-            if (_head == NULL) {
-                _tail = NULL;
-            } else {
-                _head->_prev = NULL;
-            }
-        }
-        DCHECK(result != NULL);
-        result->_next = result->_prev = NULL;
-        result->_parent_queue = NULL;
-        return reinterpret_cast(result);
-    }
-
-    // Dequeues an element from the queue's tail. Returns NULL if the queue
-    // is empty. This is O(1).
-    T* pop_back() {
-        Node* result = NULL;
-        {
-            boost::lock_guard lock(_lock);
-            if (empty()) {
-                return NULL;
-            }
-            --_size;
-            result = _tail;
-            _tail = _tail->_prev;
-            if (_tail == NULL) {
-                _head = NULL;
-            } else {
-                _tail->_next = NULL;
-            }
-        }
-        DCHECK(result != NULL);
-        result->_next = result->_prev = NULL;
-        result->_parent_queue = NULL;
-        return reinterpret_cast(result);
-    }
-
-    // Removes 'node' from the queue. This is O(1). No-op if node is
-    // not on the list.
-    void remove(T* n) {
-        Node* node = (Node*)n;
-        if (node->_parent_queue == NULL) {
-            return;
-        }
-        DCHECK(node->_parent_queue == this);
-        {
-            boost::lock_guard lock(_lock);
-            if (node->_next == NULL && node->_prev == NULL) {
-                // Removing only node
-                DCHECK(node == _head);
-                DCHECK(_tail == node);
-                _head = _tail = NULL;
-                --_size;
-                node->_parent_queue = NULL;
-                return;
-            }
-
-            if (_head == node) {
-                DCHECK(node->_prev == NULL);
-                _head = node->_next;
-            } else {
-                DCHECK(node->_prev != NULL);
-                node->_prev->_next = node->_next;
-            }
-
-            if (node == _tail) {
-                DCHECK(node->_next == NULL);
-                _tail = node->_prev;
-            } else if (node->_next != NULL) {
-                node->_next->_prev = node->_prev;
-            }
-            --_size;
-        }
-        node->_next = node->_prev = NULL;
-        node->_parent_queue = NULL;
-    }
-
-    // Clears all elements in the list.
-    void clear() {
-        boost::lock_guard lock(_lock);
-        Node* cur = _head;
-        while (cur != NULL) {
-            Node* tmp = cur;
-            cur = cur->_next;
-            tmp->_prev = tmp->_next = NULL;
-            tmp->_parent_queue = NULL;
-        }
-        _size = 0;
-        _head = _tail = NULL;
-    }
-
-    int size() const {
-        return _size;
-    }
-    bool empty() const {
-        return _head == NULL;
-    }
-
-    // Returns if the target is on the queue. This is O(1) and intended to
-    // be used for debugging.
-    bool contains(const T* target) const {
-        return target->_parent_queue == this;
-    }
-
-    // Validates the internal structure of the list
-    bool validate() {
-        int num_elements_found = 0;
-        boost::lock_guard lock(_lock);
-        if (_head == NULL) {
-            if (_tail != NULL) return false;
-            if (size() != 0) return false;
-            return true;
-        }
-
-        if (_head->_prev != NULL) return false;
-        Node* current = _head;
-        while (current != NULL) {
-            if (current->_parent_queue != this) return false;
-            ++num_elements_found;
-            Node* next = current->_next;
-            if (next == NULL) {
-                if (current != _tail) return false;
-            } else {
-                if (next->_prev != current) return false;
-            }
-            current = next;
-        }
-        if (num_elements_found != size()) return false;
+  /// Removes 'node' from the queue. This is O(1). No-op if node is
+  /// not on the list. Returns true if removed
+  bool remove(T* n) {
+    Node* node = (Node*)n;
+    if (node->parent_queue != this) return false;
+    {
+      boost::lock_guard<LockType> lock(lock_);
+      if (node->next_node == NULL && node->prev_node == NULL) {
+        // Removing only node
+        DCHECK(node == head_);
+        DCHECK(tail_ == node);
+        head_ = tail_ = NULL;
+        --size_;
+        node->parent_queue = NULL;
         return true;
+      }
+
+      if (head_ == node) {
+        DCHECK(node->prev_node == NULL);
+        head_ = node->next_node;
+      } else {
+        DCHECK(node->prev_node != NULL);
+        node->prev_node->next_node = node->next_node;
+      }
+
+      if (node == tail_) {
+        DCHECK(node->next_node == NULL);
+        tail_ = node->prev_node;
+      } else if (node->next_node != NULL) {
+        node->next_node->prev_node = node->prev_node;
+      }
+      --size_;
+    }
+    node->next_node = node->prev_node = NULL;
+    node->parent_queue = NULL;
+    return true;
+  }
+
+  /// Clears all elements in the list.
+  void clear() {
+    boost::lock_guard<LockType> lock(lock_);
+    Node* cur = head_;
+    while (cur != NULL) {
+      Node* tmp = cur;
+      cur = cur->next_node;
+      tmp->prev_node = tmp->next_node = NULL;
+      tmp->parent_queue = NULL;
+    }
+    size_ = 0;
+    head_ = tail_ = NULL;
+  }
+
+  int size() const { return size_; }
+  bool empty() const { return head_ == NULL; }
+
+  /// Returns if the target is on the queue. This is O(1) and does not acquire any locks.
+  bool contains(const T* target) const {
+    return target->parent_queue == this;
+  }
+
+  /// Validates the internal structure of the list
+  bool validate() {
+    int num_elements_found = 0;
+    boost::lock_guard<LockType> lock(lock_);
+    if (head_ == NULL) {
+      if (tail_ != NULL) return false;
+      if (size() != 0) return false;
+      return true;
     }
 
-    // Prints the queue ptrs to a string.
-    std::string debug_string() {
-        std::stringstream ss;
-        ss << "(";
-        {
-            boost::lock_guard lock(_lock);
-            Node* curr = _head;
-            while (curr != NULL) {
-                ss << (void*)curr;
-                curr = curr->_next;
-            }
-        }
-        ss << ")";
-        return ss.str();
+    if (head_->prev_node != NULL) return false;
+    Node* current = head_;
+    while (current != NULL) {
+      if (current->parent_queue != this) return false;
+      ++num_elements_found;
+      Node* next_node = current->next_node;
+      if (next_node == NULL) {
+        if (current != tail_) return false;
+      } else {
+        if (next_node->prev_node != current) return false;
+      }
+      current = next_node;
     }
+    if (num_elements_found != size()) return false;
+    return true;
+  }
 
-private:
-    friend struct Node;
-    mutable SpinLock _lock;
-    Node* _head;
-    Node* _tail;
-    int _size;
+  // Iterate over elements of queue, calling 'fn' for each element. If 'fn' returns
+  // false, terminate iteration. It is invalid to call other InternalQueue methods
+  // from 'fn'.
+  void iterate(boost::function<bool(T*)> fn) {
+    boost::lock_guard<LockType> lock(lock_);
+    for (Node* current = head_; current != NULL; current = current->next_node) {
+      if (!fn(reinterpret_cast<T*>(current))) return;
+    }
+  }
+
+  /// Prints the queue ptrs to a string.
+  std::string DebugString() {
+    std::stringstream ss;
+    ss << "(";
+    {
+      boost::lock_guard<LockType> lock(lock_);
+      Node* curr = head_;
+      while (curr != NULL) {
+        ss << (void*)curr;
+        curr = curr->next_node;
+      }
+    }
+    ss << ")";
+    return ss.str();
+  }
+
+ private:
+  friend struct Node;
+  mutable LockType lock_;
+  Node *head_, *tail_;
+  int size_;
 };
 
-} // end namespace palo
-
-#endif // BDG_PALO_BE_SRC_UTIL_INTERNAL_QUEUE_H
+// The default LockType is SpinLock.
+template <typename T>
+class InternalQueue : public InternalQueueBase<T, SpinLock> {};
 
+// InternalList is a non-threadsafe implementation.
+template <typename T>
+class InternalList : public InternalQueueBase<T, FakeLock> {};
+}
+#endif
diff --git a/be/src/util/mem_range.h b/be/src/util/mem_range.h
new file mode 100755
index 0000000000..af8218d7bf
--- /dev/null
+++ b/be/src/util/mem_range.h
@@ -0,0 +1,50 @@
+// Modifications copyright (C) 2017, Baidu.com, Inc.
+// Copyright 2017 The Apache Software Foundation
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BDG_PALO_BE_SRC_UTIL_MEM_RANGE_H
+#define BDG_PALO_BE_SRC_UTIL_MEM_RANGE_H
+
+#include <cstdint>
+
+#include "common/logging.h"
+
+namespace palo {
+
+/// Represents a range of memory. This is a convenient alternative to passing around
+/// a separate pointer and length.
+class MemRange {
+ public:
+  MemRange(uint8_t* data, int64_t len) : data_(data), len_(len) {
+    DCHECK_GE(len, 0);
+    DCHECK(len == 0 || data != nullptr);
+  }
+
+  uint8_t* data() const { return data_; }
+  int64_t len() const { return len_; }
+
+  static MemRange null() { return MemRange(nullptr, 0); }
+
+ private:
+  uint8_t* data_;
+  int64_t len_;
+};
+}
+
+#endif
diff --git a/be/src/util/metrics.cpp b/be/src/util/metrics.cpp
index ff63dd6da8..1190fbd53b 100644
--- a/be/src/util/metrics.cpp
+++ b/be/src/util/metrics.cpp
@@ -1,13 +1,8 @@
-// Modifications copyright (C) 2017, Baidu.com, Inc.
-// Copyright 2017 The Apache Software Foundation
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
 
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
@@ -19,281 +14,147 @@
 // under the License.
 
 #include "util/metrics.h"
-#include 
-#include 
-#include 
-
-#include 
-#include 
-#include 
-#include 
-
-#include "common/logging.h"
-#include "gutil/strings/substitute.h"
-#include "util/palo_metrics.h"
-#include "http/web_page_handler.h"
 
 namespace palo {
 
-template <>
-void ToJsonValue(const std::string& value, const TUnit::type unit,
-        rapidjson::Document* document, rapidjson::Value* out_val) {
-    rapidjson::Value val(value.c_str(), document->GetAllocator());
-    *out_val = val;
-}
+MetricLabels MetricLabels::EmptyLabels;
 
-void Metric::AddStandardFields(rapidjson::Document* document, rapidjson::Value* val) {
-    rapidjson::Value name(_key.c_str(), document->GetAllocator());
-    val->AddMember("name", name, document->GetAllocator());
-    rapidjson::Value desc(_description.c_str(), document->GetAllocator());
-    val->AddMember("description", desc, document->GetAllocator());
-    rapidjson::Value metric_value(ToHumanReadable().c_str(), document->GetAllocator());
-    val->AddMember("human_readable", metric_value, document->GetAllocator());
-}
-
-MetricDefs* MetricDefs::GetInstance() {
-    // Note that this is not thread-safe in C++03 (but will be in C++11 see
-    // http://stackoverflow.com/a/19907903/132034). We don't bother with the double-check
-    // locking pattern because it introduces complexity whereas a race is very unlikely
-    // and it doesn't matter if we construct two instances since MetricDefsConstants is
-    // just a constant map.
-    static MetricDefs instance;
-    return &instance;
-}
-
-TMetricDef MetricDefs::Get(const std::string& key, const std::string& arg) {
-    MetricDefs* inst = GetInstance();
-    std::map::iterator it = inst->_metric_defs.TMetricDefs.find(key);
-    if (it == inst->_metric_defs.TMetricDefs.end()) {
-        DCHECK(false) << "Could not find metric definition for key=" << key << " arg=" << arg;
-        return TMetricDef();
+std::ostream& operator<<(std::ostream& os, MetricType type) {
+    switch (type) {
+    case MetricType::COUNTER:
+        os << "COUNTER";
+        break;
+    case MetricType::GAUGE:
+        os << "GAUGE";
+        break;
+    case MetricType::HISTOGRAM:
+        os << "HISTOGRAM";
+        break;
+    case MetricType::SUMMARY:
+        os << "SUMMARY";
+        break;
+    case MetricType::UNTYPED:
+        os << "UNTYPED";
+        break;
+    default:
+        os << "UNKNOWN";
+        break;
     }
-    TMetricDef md = it->second;
-    md.__set_key(strings::Substitute(md.key, arg));
-    md.__set_description(strings::Substitute(md.description, arg));
-    return md;
+    return os;
 }
 
-MetricGroup::MetricGroup(const std::string& name)
-    : _obj_pool(new ObjectPool()), _name(name) { }
-
-Status MetricGroup::init(WebPageHandler* webserver) {
-    if (webserver != NULL) {
-        WebPageHandler::PageHandlerCallback default_callback =
-            boost::bind(boost::mem_fn(&MetricGroup::text_callback), this, _1, _2);
-        webserver->register_page("/metrics", default_callback);
-
-        WebPageHandler::PageHandlerCallback json_callback =
-            boost::bind(boost::mem_fn(&MetricGroup::json_callback), this, _1, _2);
-        webserver->register_page("/jsonmetrics", json_callback);
-    }
-
-    return Status::OK;
-}
-
-/// TODO: init, CMCompatibleCallback, TemplateCallback are for new webserver
-/*
-Status MetricGroup::init(Webserver* webserver) {
-    if (webserver != NULL) {
-        Webserver::UrlCallback default_callback =
-            bind(mem_fn(&MetricGroup::CMCompatibleCallback), this, _1, _2);
-        webserver->RegisterUrlCallback("/jsonmetrics", "legacy-metrics.tmpl",
-                default_callback, false);
-
-        Webserver::UrlCallback json_callback =
-            bind(mem_fn(&MetricGroup::TemplateCallback), this, _1, _2);
-        webserver->RegisterUrlCallback("/metrics", "metrics.tmpl", json_callback);
-    }
-
-    return Status::OK();
-}
-
-void MetricGroup::CMCompatibleCallback(const Webserver::ArgumentMap& args,
-                                       Document* document) {
-    // If the request has a 'metric' argument, search all top-level metrics for that metric
-    // only. Otherwise, return document with list of all metrics at the top level.
-    Webserver::ArgumentMap::const_iterator metric_name = args.find("metric");
-
-    lock_guard l(_lock);
-    if (metric_name != args.end()) {
-        MetricMap::const_iterator metric = _metric_map.find(metric_name->second);
-        if (metric != _metric_map.end()) {
-            metric->second->ToLegacyJson(document);
-        }
+void Metric::hide() {
+    if (_registry == nullptr) {
         return;
     }
-
-    stack groups;
-    groups.push(this);
-    do {
-        // Depth-first traversal of children to flatten all metrics, which is what was
-        // expected by CM before we introduced metric groups.
-        MetricGroup* group = groups.top();
-        for (const ChildGroupMap::value_type& child: group->_children) {
-            groups.push(child.second);
-        }
-        for (const MetricMap::value_type& m: group->_metric_map) {
-            m.second->ToLegacyJson(document);
-        }
-    } while (!groups.empty());
+    _registry->deregister_metric(this);
+    _registry = nullptr;
 }
 
-void MetricGroup::TemplateCallback(const Webserver::ArgumentMap& args,
-                                   Document* document) {
-    Webserver::ArgumentMap::const_iterator metric_group = args.find("metric_group");
-
-    lock_guard l(_lock);
-    // If no particular metric group is requested, render this metric group (and all its
-    // children).
-    if (metric_group == args.end()) {
-        Value container;
-        ToJson(true, document, &container);
-        document->AddMember("metric_group", container, document->GetAllocator());
-        return;
-    }
-
-    // Search all metric groups to find the one we're looking for. In the future, we'll
-    // change this to support path-based resolution of metric groups.
-    MetricGroup* found_group = NULL;
-    stack groups;
-    groups.push(this);
-    while (!groups.empty() && found_group == NULL) {
-        // Depth-first traversal of children to flatten all metrics, which is what was
-        // expected by CM before we introduced metric groups.
-        MetricGroup* group = groups.top();
-        groups.pop();
-        for (const ChildGroupMap::value_type& child: group->_children) {
-            if (child.first == metric_group->second) {
-                found_group = child.second;
-                break;
-            }
-            groups.push(child.second);
-        }
-    }
-    if (found_group != NULL) {
-        Value container;
-        found_group->ToJson(false, document, &container);
-        document->AddMember("metric_group", container, document->GetAllocator());
+bool MetricCollector::add_metic(const MetricLabels& labels, Metric* metric) {
+    if (empty()) {
+        _type = metric->type();
     } else {
-        Value error(Substitute("Metric group $0 not found", metric_group->second).c_str(),
-                    document->GetAllocator());
-        document->AddMember("error", error, document->GetAllocator());
-    }
-}
-*/
-
-void MetricGroup::ToJson(bool include_children, rapidjson::Document* document, rapidjson::Value* out_val) {
-    rapidjson::Value metric_list(rapidjson::kArrayType);
-    for (const MetricMap::value_type& m: _metric_map) {
-        rapidjson::Value metric_value;
-        m.second->ToJson(document, &metric_value);
-        metric_list.PushBack(metric_value, document->GetAllocator());
-    }
-
-    rapidjson::Value container(rapidjson::kObjectType);
-    container.AddMember("metrics", metric_list, document->GetAllocator());
-    container.AddMember("name", 
-        rapidjson::Value(_name.c_str(), document->GetAllocator()).Move(), document->GetAllocator());
-    if (include_children) {
-        rapidjson::Value child_groups(rapidjson::kArrayType);
-        for (const ChildGroupMap::value_type& child: _children) {
-            rapidjson::Value child_value;
-            child.second->ToJson(true, document, &child_value);
-            child_groups.PushBack(child_value, document->GetAllocator());
+        if (metric->type() != _type) {
+            return false;
         }
-        container.AddMember("child_groups", child_groups, document->GetAllocator());
     }
-
-    *out_val = container;
+    auto it = _metrics.emplace(labels, metric);
+    return it.second;
 }
 
-MetricGroup* MetricGroup::GetOrCreateChildGroup(const std::string& name) {
-    std::lock_guard l(_lock);
-    ChildGroupMap::iterator it = _children.find(name);
-    if (it != _children.end()) return it->second;
-    MetricGroup* group = _obj_pool->add(new MetricGroup(name));
-    _children[name] = group;
-    return group;
-}
-
-MetricGroup* MetricGroup::FindChildGroup(const std::string& name) {
-    std::lock_guard l(_lock);
-    ChildGroupMap::iterator it = _children.find(name);
-    if (it != _children.end()) return it->second;
-    return NULL;
-}
-
-///TODO: debug string is for new web server
-/*
-std::string MetricGroup::debug_string() {
-    Webserver::ArgumentMap empty_map;
-    rapidjson::Document document;
-    document.SetObject();
-    TemplateCallback(empty_map, &document);
-    StringBuffer strbuf;
-    PrettyWriter writer(strbuf);
-    document.Accept(writer);
-    return strbuf.GetString();
-}
-*/
-
-TMetricDef MakeTMetricDef(const std::string& key, TMetricKind::type kind,
-                                  TUnit::type unit) {
-    TMetricDef ret;
-    ret.__set_key(key);
-    ret.__set_kind(kind);
-    ret.__set_units(unit);
-    return ret;
-}
-
-void MetricGroup::print_metric_map(std::stringstream* output) {
-    std::lock_guard l(_lock);
-    BOOST_FOREACH(const MetricMap::value_type & m, _metric_map) {
-        m.second->print(output);
-        (*output) << std::endl;
+void MetricCollector::remove_metric(Metric* metric) {
+    for (auto& it : _metrics) {
+        if (it.second == metric) {
+            _metrics.erase(it.first);
+            break;
+        }
     }
 }
 
-void MetricGroup::print_metric_map_as_json(std::vector* metrics) {
-    std::lock_guard l(_lock);
-    BOOST_FOREACH(const MetricMap::value_type & m, _metric_map) {
-        std::stringstream ss;
-        m.second->print_json(&ss);
-        metrics->push_back(ss.str());
+Metric* MetricCollector::get_metric(const MetricLabels& labels) const {
+    auto it = _metrics.find(labels);
+    if (it != std::end(_metrics)) {
+        return it->second;
+    }
+    return nullptr;
+}
+
+void MetricCollector::get_metrics(std::vector<Metric*>* metrics) {
+    for (auto& it : _metrics) {
+        metrics->push_back(it.second);
     }
 }
 
-std::string MetricGroup::debug_string() {
-    std::stringstream ss;
-    WebPageHandler::ArgumentMap empty_map;
-    text_callback(empty_map, &ss);
-    return ss.str();
+MetricRegistry::~MetricRegistry() {
+    {
+        std::lock_guard<std::mutex> l(_lock);
+
+        std::vector<Metric*> metrics;
+        for (auto& it : _collectors) {
+            it.second->get_metrics(&metrics);
+        }
+        for (auto metric : metrics) {
+            _deregister_locked(metric);
+        }
+    }
+    // All register metric will deregister
+    DCHECK(_collectors.empty()) << "_collectors not empty, size=" << _collectors.size();
 }
 
-std::string MetricGroup::debug_string_json() {
-    std::stringstream ss;
-    WebPageHandler::ArgumentMap empty_map;
-    json_callback(empty_map, &ss);
-    return ss.str();
+bool MetricRegistry::register_metric(const std::string& name,
+                                     const MetricLabels& labels,
+                                     Metric* metric) {
+    metric->hide();
+    std::lock_guard l(_lock);
+    MetricCollector* collector = nullptr;
+    auto it = _collectors.find(name);
+    if (it == std::end(_collectors)) {
+        collector = new MetricCollector();
+        _collectors.emplace(name, collector);
+    } else {
+        collector = it->second;
+    }
+    auto res = collector->add_metic(labels, metric);
+    if (res) {
+        metric->_registry = this;
+    }
+    return res;
 }
 
-void MetricGroup::text_callback(const WebPageHandler::ArgumentMap& args, std::stringstream* output) {
-    (*output) << "
";
-    print_metric_map(output);
-    (*output) << "
"; +void MetricRegistry::_deregister_locked(Metric* metric) { + std::vector to_erase; + for (auto& it : _collectors) { + it.second->remove_metric(metric); + if (it.second->empty()) { + to_erase.emplace_back(it.first); + } + } + for (auto& name : to_erase) { + auto it = _collectors.find(name); + delete it->second; + _collectors.erase(it); + } } -void MetricGroup::json_callback(const WebPageHandler::ArgumentMap& args, std::stringstream* output) { - (*output) << "{"; - std::vector metrics; - print_metric_map_as_json(&metrics); - (*output) << boost::join(metrics, ",\n"); - (*output) << "}"; +Metric* MetricRegistry::get_metric(const std::string& name, const MetricLabels& labels) const { + std::lock_guard l(_lock); + auto it = _collectors.find(name); + if (it != std::end(_collectors)) { + return it->second->get_metric(labels); + } + return nullptr; } -template<> void print_primitive_as_json(const std::string& v, - std::stringstream* out) { - (*out) << "\"" << v << "\""; +bool MetricRegistry::register_hook(const std::string& name, const std::function& hook) { + std::lock_guard l(_lock); + auto it = _hooks.emplace(name, hook); + return it.second; +} + +void MetricRegistry::deregister_hook(const std::string& name) { + std::lock_guard l(_lock); + _hooks.erase(name); } } diff --git a/be/src/util/metrics.h b/be/src/util/metrics.h index af73dff9c9..7684408e6c 100644 --- a/be/src/util/metrics.h +++ b/be/src/util/metrics.h @@ -1,13 +1,8 @@ -// Modifications copyright (C) 2017, Baidu.com, Inc. -// Copyright 2017 The Apache Software Foundation +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // @@ -18,448 +13,317 @@ // specific language governing permissions and limitations // under the License. -#ifndef BDG_PALO_BE_SRC_COMMON_UTIL_METRICS_H -#define BDG_PALO_BE_SRC_COMMON_UTIL_METRICS_H +#pragma once -#include -#include -#include -#include -#include +#include +#include +#include +#include #include +#include +#include +#include -#include -#include -#include - -#include "common/logging.h" -#include "common/status.h" -#include "common/object_pool.h" -#include "gen_cpp/MetricDefs_types.h" -#include "gen_cpp/MetricDefs_constants.h" -#include "http/web_page_handler.h" -#include "util/debug_util.h" -#include "util/json_util.h" -#include "util/pretty_printer.h" +#include "util/spinlock.h" +#include "util/core_local.h" namespace palo { -// Helper method to print a single primitive value as a Json atom -template void print_primitive_as_json(const T& v, std::stringstream* out) { - (*out) << v; -} +class MetricRegistry; -// Specialisation to print string values inside quotes when writing to Json -template<> void print_primitive_as_json(const std::string& v, - std::stringstream* out); - -/// Singleton that provides metric definitions. Metrics are defined in metrics.json -/// and generate_metrics.py produces MetricDefs.thrift. This singleton wraps an instance -/// of the thrift definitions. -class MetricDefs { -public: - /// Gets the TMetricDef for the metric key. 'arg' is an optional argument to the - /// TMetricDef for metrics defined by a format string. The key must exist or a DCHECK - /// will fail. - /// TODO: Support multiple arguments. - static TMetricDef Get(const std::string& key, const std::string& arg = ""); - -private: - friend class MetricsTest; - - /// Gets the MetricDefs singleton. 
- static MetricDefs* GetInstance(); - - /// Contains the map of all TMetricDefs, non-const for testing - //typedef std::map MetricDefsConstants; - MetricDefsConstants _metric_defs; - - MetricDefs() { } - DISALLOW_COPY_AND_ASSIGN(MetricDefs); +enum class MetricType { + COUNTER, + GAUGE, + HISTOGRAM, + SUMMARY, + UNTYPED }; -/// A metric is a container for some value, identified by a string key. Most metrics are -/// numeric, but this metric base-class is general enough such that metrics may be lists, -/// maps, histograms or other arbitrary structures. -// -/// Metrics must be able to convert themselves to JSON (for integration with our monitoring -/// tools, and for rendering in webpages). See ToJson(), and also ToLegacyJson() which -/// ensures backwards compatibility with older versions of CM. -// -/// Metrics should be supplied with a description, which is included in JSON output for -/// display by monitoring systems / Impala's webpages. -// -/// TODO: Add ToThrift() for conversion to an RPC-friendly format. -//template -//class Metric : public GenericMetric { +std::ostream& operator<<(std::ostream& os, MetricType type); + class Metric { public: - /// Empty virtual destructor - virtual ~Metric() {} + Metric(MetricType type) :_type(type), _registry(nullptr) { } + virtual ~Metric() { hide(); } + MetricType type() const { return _type; } + void hide(); +private: + friend class MetricRegistry; - /// Builds a new Value into 'val', using (if required) the allocator from - /// 'document'. Should set the following fields where appropriate: - // - /// name, value, human_readable, description - virtual void ToJson(rapidjson::Document* document, rapidjson::Value* val) = 0; - - /// Adds a new json value directly to 'document' of the form: - /// "name" : "human-readable-string" - // - /// This method is kept for backwards-compatibility with CM5.0. 
- virtual void ToLegacyJson(rapidjson::Document* document) = 0; - - /// Writes a human-readable representation of this metric to 'out'. This is the - /// representation that is often displayed in webpages etc. - virtual std::string ToHumanReadable() = 0; - - const std::string& key() const { return _key; } - const std::string& description() const { return _description; } - - virtual void print(std::stringstream* out) { - std::lock_guard l(_lock); - (*out) << _key << ":"; - print_value(out); - } - - virtual void print_json(std::stringstream* out) { - std::lock_guard l(_lock); - (*out) << "\"" << _key << "\": "; - print_value_json(out); - } - -protected: - // Subclasses are required to implement this to print a string - // representation of the metric to the supplied stringstream. - // Both methods are always called with _lock taken, so implementations must - // not try and take _lock themselves.. - virtual void print_value(std::stringstream* out) = 0; - virtual void print_value_json(std::stringstream* out) = 0; - - // Unique key identifying this metric - const std::string _key; - - /// Description of this metric. - /// TODO: share one copy amongst metrics with the same description. - const std::string _description; - - friend class MetricGroup; - - Metric(const TMetricDef& def) : _key(def.key), _description(def.description) { } - - /// Convenience method to add standard fields (name, description, human readable string) - /// to 'val'. - void AddStandardFields(rapidjson::Document* document, rapidjson::Value* val); - - // Guards access to value - std::mutex _lock; + MetricType _type; + MetricRegistry* _registry; }; -/// A SimpleMetric has a value which is a simple primitive type: e.g. integers, strings and -/// floats. It is parameterised not only by the type of its value, but by both the unit -/// (e.g. bytes/s), drawn from TUnit and the 'kind' of the metric itself. 
The kind -/// can be one of: 'gauge', which may increase or decrease over time, a 'counter' which is -/// increasing only over time, or a 'property' which is not numeric. -// -/// SimpleMetrics return their current value through the value() method. Access to value() -/// is thread-safe. -// -/// TODO: We can use type traits to select a more efficient lock-free implementation of -/// value() etc. where it is safe to do so. -/// TODO: CalculateValue() can be returning a value, its current interface is not clean. -//template -//class PrimitiveMetric : public Metric { - -template class SimpleMetric : public Metric { public: - SimpleMetric(const TMetricDef& metric_def, const T& initial_value) - : Metric(metric_def), _unit(metric_def.units), _value(initial_value) { - DCHECK_EQ(metric_kind, metric_def.kind) << "Metric kind does not match definition: " - << metric_def.key; - } - + SimpleMetric(MetricType type) :Metric(type) { } virtual ~SimpleMetric() { } + virtual std::string to_string() const = 0; +}; - /// Returns the current value, updating it if necessary. Thread-safe. - T value() { +// Metric that only can increment +template +class LockSimpleMetric : public SimpleMetric { +public: + LockSimpleMetric(MetricType type) :SimpleMetric(type), _value(T()) { } + virtual ~LockSimpleMetric() { } + + std::string to_string() const override { + std::stringstream ss; + ss << value(); + return ss.str(); + } + + T value() const { std::lock_guard l(_lock); - CalculateValue(); return _value; } - /// Sets the current value. Thread-safe. - void set_value(const T& value) { - std::lock_guard l(_lock); - _value = value; - } - - /// Adds 'delta' to the current value atomically. 
void increment(const T& delta) { - DCHECK(kind() != TMetricKind::PROPERTY) - << "Can't change value of PROPERTY metric: " << key(); - DCHECK(kind() != TMetricKind::COUNTER || delta >= 0) - << "Can't decrement value of COUNTER metric: " << key(); - if (delta == 0) return; - std::lock_guard l(_lock); - _value += delta; + std::lock_guard l(this->_lock); + this->_value += delta; } - - // Sets current metric value to parameter - virtual void update(const T& value) { - std::lock_guard l(_lock); - _value = value; + void set_value(const T& value) { + std::lock_guard l(this->_lock); + this->_value = value; } - - virtual void ToJson(rapidjson::Document* document, rapidjson::Value* val) { - rapidjson::Value container(rapidjson::kObjectType); - AddStandardFields(document, &container); - - rapidjson::Value metric_value; - ToJsonValue(value(), TUnit::NONE, document, &metric_value); - container.AddMember("value", metric_value, document->GetAllocator()); - - rapidjson::Value type_value(PrintTMetricKind(kind()).c_str(), - document->GetAllocator()); - container.AddMember("kind", type_value, document->GetAllocator()); - rapidjson::Value units(PrintTUnit(unit()).c_str(), document->GetAllocator()); - container.AddMember("units", units, document->GetAllocator()); - *val = container; - } - - virtual std::string ToHumanReadable() { - return PrettyPrinter::print(value(), unit()); - } - - virtual void ToLegacyJson(rapidjson::Document* document) { - rapidjson::Value val; - ToJsonValue(value(), TUnit::NONE, document, &val); - rapidjson::Value temp(_key.c_str(), document->GetAllocator()); - document->AddMember(temp, val, document->GetAllocator()); - } - - TUnit::type unit() const { return _unit; } - TMetricKind::type kind() const { return metric_kind; } - protected: - /// Called to compute value_ if necessary during calls to value(). The more natural - /// approach would be to have virtual T value(), but that's not possible in C++. 
- // - /// TODO: Should be cheap to have a blank implementation, but if required we can cause - /// the compiler to avoid calling this entirely through a compile-time constant. - virtual void CalculateValue() { } - - /// Units of this metric. - const TUnit::type _unit; - - /// Guards access to value_. - SpinLock _lock; - - /// The current value of the metric + // We use spinlock instead of std::atomic is because atomic don't support + // double's fetch_add + // TODO(zc): If this is atomic is bottleneck, we change to thread local. + // performance: on Intel(R) Xeon(R) CPU E5-2450 int64_t + // original type: 2ns/op + // single thread spinlock: 26ns/op + // multiple thread(8) spinlock: 2500ns/op + mutable SpinLock _lock; T _value; - - virtual void print_value(std::stringstream* out) { - (*out) << this->_value; - } - - virtual void print_value_json(std::stringstream* out) { - print_primitive_as_json(this->_value, out); - } }; -// Gauge metric that computes the sum of several gauges. -template -class SumGauge : public SimpleMetric { +template +class CoreLocalCouter : public SimpleMetric { public: - SumGauge(const TMetricDef& metric_def, - const std::vector*>& metrics) - : SimpleMetric(metric_def, 0), _metrics(metrics) {} - virtual ~SumGauge() {} + CoreLocalCouter() :SimpleMetric(MetricType::COUNTER), _value() { } + virtual ~CoreLocalCouter() { } -private: - virtual void CalculateValue() override { + std::string to_string() const override { + std::stringstream ss; + ss << value(); + return ss.str(); + } + + T value() const { T sum = 0; - for (SimpleMetric* metric : _metrics) sum += metric->value(); - this->_value = sum; - } - - /// The metrics to be summed. - std::vector*> _metrics; -}; - -/// Container for a set of metrics. A MetricGroup owns the memory for every metric -/// contained within it (see Add*() to create commonly used metric -/// types). Metrics are 'registered' with a MetricGroup, once registered they cannot be -/// deleted. 
-// -/// MetricGroups may be organised hierarchically as a tree. -// -/// Typically a metric object is cached by its creator after registration. If a metric -/// must be retrieved without an available pointer, FindMetricForTesting() will search the -/// MetricGroup and all its descendent MetricGroups in turn. -// -/// TODO: Hierarchical naming: that is, resolve "group1.group2.metric-name" to a path -/// through the metric tree. -class MetricGroup { -public: - MetricGroup(const std::string& name); - - // Registers a new metric. Ownership of the metric will be transferred to this - // Metrics object, so callers should take care not to destroy the Metric they - // pass in. - // If a metric already exists with the supplied metric's key, it is replaced. - // The template parameter M must be a subclass of Metric. - template - M* register_metric(M* metric) { - DCHECK(!metric->_key.empty()); - M* mt = _obj_pool->add(metric); - - std::lock_guard l(_lock); - DCHECK(_metric_map.find(metric->_key) == _metric_map.end()); - _metric_map[metric->_key] = mt; - return mt; - } - - /// Create a gauge metric object with given key and initial value (owned by this object) - template - SimpleMetric* AddGauge(const std::string& key, const T& value, - const std::string& metric_def_arg = "") { - return register_metric(new SimpleMetric( - MetricDefs::Get(key, metric_def_arg), value)); - } - - template - SimpleMetric* AddProperty(const std::string& key, - const T& value, const std::string& metric_def_arg = "") { - return register_metric(new SimpleMetric( - MetricDefs::Get(key, metric_def_arg), value)); - } - - template - SimpleMetric* AddCounter(const std::string& key, - const T& value, const std::string& metric_def_arg = "") { - return register_metric(new SimpleMetric( - MetricDefs::Get(key, metric_def_arg), value)); - } - - /// Returns a metric by key. All MetricGroups reachable from this group are searched in - /// depth-first order, starting with the root group. 
Returns NULL if there is no metric - /// with that key. This is not a very cheap operation; the result should be cached where - /// possible. - // - /// Used for testing only. - template - M* FindMetricForTesting(const std::string& key) { - std::stack groups; - groups.push(this); - std::lock_guard l(_lock); - do { - MetricGroup* group = groups.top(); - groups.pop(); - MetricMap::const_iterator it = group->_metric_map.find(key); - if (it != group->_metric_map.end()) return reinterpret_cast(it->second); - for (const ChildGroupMap::value_type& child: group->_children) { - groups.push(child.second); - } - } while (!groups.empty()); - return NULL; - } - - // Returns a metric by key. Returns NULL if there is no metric with that - // key. This is not a very cheap operation and should not be called in a loop. - // If the metric needs to be updated in a loop, the returned metric should be cached. - template - M* get_metric(const std::string& key) { - std::lock_guard l(_lock); - MetricMap::iterator it = _metric_map.find(key); - - if (it == _metric_map.end()) { - return NULL; + for (int i = 0; i < _value.size(); ++i) { + sum += *_value.access_at_core(i); } - - return reinterpret_cast(it->second); + return sum; } - // Register page callbacks with the webserver - Status init(WebPageHandler* webserver); - - /// Converts this metric group (and optionally all of its children recursively) to JSON. - void ToJson(bool include_children, rapidjson::Document* document, - rapidjson::Value* out_val); - - /// Creates or returns an already existing child metric group. 
- MetricGroup* GetOrCreateChildGroup(const std::string& name); - - /// Returns a child metric group with name 'name', or NULL if that group doesn't exist - MetricGroup* FindChildGroup(const std::string& name); - - // Useful for debuggers, returns the output of text_callback - std::string debug_string(); - - // Same as above, but for Json output - std::string debug_string_json(); - - const std::string& name() const { return _name; } - -private: - // Pool containing all metric objects - boost::scoped_ptr _obj_pool; - - /// Name of this metric group. - std::string _name; - - // Guards _metric_map - //std::mutex _lock; - SpinLock _lock; - - // Contains all Metric objects, indexed by key - typedef std::map MetricMap; - MetricMap _metric_map; - - /// All child metric groups - typedef std::map ChildGroupMap; - ChildGroupMap _children; - - /// Webserver callback for /metrics. Produces a tree of JSON values, each representing a - /// metric group, and each including a list of metrics, and a list of immediate - /// children. If args contains a paramater 'metric', only the json for that metric is - /// returned. - /// TODO: new webserver for runtime profile - //void TemplateCallback(const Webserver::ArgumentMap& args, - // rapidjson::Document* document); - - /// Legacy webpage callback for CM 5.0 and earlier. Produces a flattened map of (key, - /// value) pairs for all metrics in this hierarchy. - /// If args contains a paramater 'metric', only the json for that metric is returned. 
- /// TODO: new webserver for runtime profile - //void CMCompatibleCallback(const Webserver::ArgumentMap& args, - // rapidjson::Document* document); - - // Writes _metric_map as a list of key : value pairs - void print_metric_map(std::stringstream* output); - - // Builds a list of metrics as Json-style "key": "value" pairs - void print_metric_map_as_json(std::vector* metrics); - - // Webserver callback (on /metrics), renders metrics as single text page - void text_callback(const WebPageHandler::ArgumentMap& args, std::stringstream* output); - - // Webserver callback (on /jsonmetrics), renders metrics as a single json document - void json_callback(const WebPageHandler::ArgumentMap& args, std::stringstream* output); - + void increment(const T& delta) { + __sync_fetch_and_add(_value.access(), delta); + } +protected: + CoreLocalValue _value; }; -typedef class SimpleMetric IntGauge; -typedef class SimpleMetric UIntGauge; -typedef class SimpleMetric DoubleGauge; -typedef class SimpleMetric IntCounter; +template +class LockCounter : public LockSimpleMetric { +public: + LockCounter() :LockSimpleMetric(MetricType::COUNTER) { } + virtual ~LockCounter() { } +}; -typedef class SimpleMetric BooleanProperty; -typedef class SimpleMetric StringProperty; +// This can only used for trival type +template +class LockGauge : public LockSimpleMetric { +public: + LockGauge() :LockSimpleMetric(MetricType::GAUGE) { } + virtual ~LockGauge() { } +}; -TMetricDef MakeTMetricDef(const std::string& key, TMetricKind::type kind, - TUnit::type unit); +// one key-value pair used to +struct MetricLabel { + std::string name; + std::string value; -} //namespace palo + MetricLabel() { } -#endif // BDG_PALO_BE_SRC_COMMON_UTIL_METRICS_H + template + MetricLabel(const T& name_, const P& value_) :name(name_), value(value_) { + } + + bool operator==(const MetricLabel& other) const { + return name == other.name && value == other.value; + } + bool operator!=(const MetricLabel& other) const { + return !(*this 
== other); + } + bool operator<(const MetricLabel& other) const { + auto res = name.compare(other.name); + if (res == 0) { + return value < other.value; + } + return res < 0; + } + int compare(const MetricLabel& other) const { + auto res = name.compare(other.name); + if (res == 0) { + return value.compare(other.value); + } + return res; + } + std::string to_string() const { + return name + "=" + value; + } +}; + +struct MetricLabels { + static MetricLabels EmptyLabels; + // used std::set to sort MetricLabel so that we can get compare two MetricLabels + std::set labels; + + MetricLabels& add(const std::string& name, const std::string& value) { + labels.emplace(name, value); + return *this; + } + + bool operator==(const MetricLabels& other) const { + if (labels.size() != other.labels.size()) { + return false; + } + auto it = std::begin(labels); + auto other_it = std::begin(other.labels); + while (it != std::end(labels)) { + if (*it != *other_it) { + return false; + } + ++it; + ++other_it; + } + return true; + } + bool operator<(const MetricLabels& other) const { + auto it = std::begin(labels); + auto other_it = std::begin(other.labels); + while (it != std::end(labels) && other_it != std::end(other.labels)) { + auto res = it->compare(*other_it); + if (res < 0) { + return true; + } else if (res > 0) { + return false; + } + ++it; + ++other_it; + } + if (it == std::end(labels)) { + if (other_it == std::end(other.labels)) { + return false; + } + return true; + } else { + return false; + } + } + bool empty() const { + return labels.empty(); + } + + std::string to_string() const { + std::stringstream ss; + int i = 0; + for (auto& label : labels) { + if (i++ > 0) { + ss << ","; + } + ss << label.to_string(); + } + return ss.str(); + } +}; + +class MetricCollector; + +class MetricsVisitor { +public: + virtual ~MetricsVisitor() { } + + // visit a collector, you can implement collector visitor, or only implement + // metric visitor + virtual void visit(const std::string& 
prefix, const std::string& name, + MetricCollector* collector) = 0; +}; + +class MetricCollector { +public: + bool add_metic(const MetricLabels& labels, Metric* metric); + void remove_metric(Metric* metric); + void collect(const std::string& prefix, const std::string& name, MetricsVisitor* visitor) { + visitor->visit(prefix, name, this); + } + bool empty() const { + return _metrics.empty(); + } + Metric* get_metric(const MetricLabels& labels) const; + // get all metrics belong to this collector + void get_metrics(std::vector* metrics); + + const std::map& metrics() const { + return _metrics; + } + MetricType type() const { return _type; } +private: + MetricType _type = MetricType::UNTYPED; + std::map _metrics; +}; + +class MetricRegistry { +public: + MetricRegistry(const std::string& name) : _name(name) { } + ~MetricRegistry(); + bool register_metric(const std::string& name, Metric* metric) { + return register_metric(name, MetricLabels::EmptyLabels, metric); + } + bool register_metric(const std::string& name, const MetricLabels& labels, Metric* metric); + // Now this function is not used frequently, so this is a little time consuming + void deregister_metric(Metric* metric) { + std::lock_guard l(_lock); + _deregister_locked(metric); + } + Metric* get_metric(const std::string& name) const { + return get_metric(name, MetricLabels::EmptyLabels); + } + Metric* get_metric(const std::string& name, const MetricLabels& labels) const; + + // Register a hook, this hook will called before collect is called + bool register_hook(const std::string& name, const std::function& hook); + void deregister_hook(const std::string& name); + + void collect(MetricsVisitor* visitor) { + std::lock_guard l(_lock); + // Before we collect, need to call hooks + for (auto& it : _hooks) { + it.second(); + } + for (auto& it : _collectors) { + it.second->collect(_name, it.first, visitor); + } + } +private: + void _deregister_locked(Metric* metric); + + const std::string _name; + + mutable SpinLock 
_lock; + std::map _collectors; + std::map> _hooks; +}; + +using IntCounter = CoreLocalCouter; +using IntLockCounter = LockCounter; +using UIntCounter = CoreLocalCouter; +using DoubleCounter = LockCounter; +using IntGauge = LockGauge; +using UIntGauge = LockGauge; +using DoubleGauge = LockGauge; + +} diff --git a/be/src/util/network_util.cpp b/be/src/util/network_util.cpp index fae915390f..351588ae53 100644 --- a/be/src/util/network_util.cpp +++ b/be/src/util/network_util.cpp @@ -160,4 +160,29 @@ TNetworkAddress make_network_address(const std::string& hostname, int port) { return ret; } +Status get_inet_interfaces(std::vector* interfaces, bool include_ipv6) { + ifaddrs* if_addrs = nullptr; + if (getifaddrs(&if_addrs)) { + std::stringstream ss; + char buf[64]; + ss << "getifaddrs failed, errno:" << errno + << ", message" << strerror_r(errno, buf, sizeof(buf)); + return Status(ss.str()); + } + + for (ifaddrs* if_addr = if_addrs; if_addr != nullptr; if_addr = if_addr->ifa_next) { + if (if_addr->ifa_addr == nullptr || if_addr->ifa_name == nullptr) { + continue; + } + if (if_addr->ifa_addr->sa_family == AF_INET || + (include_ipv6 && if_addr->ifa_addr->sa_family == AF_INET6)) { + interfaces->emplace_back(if_addr->ifa_name); + } + } + if (if_addrs != nullptr) { + freeifaddrs(if_addrs); + } + return Status::OK; +} + } diff --git a/be/src/util/network_util.h b/be/src/util/network_util.h index c069deb2ac..4e734b4069 100644 --- a/be/src/util/network_util.h +++ b/be/src/util/network_util.h @@ -56,6 +56,8 @@ Status get_hosts_v4(std::vector* hosts); // Utility method because Thrift does not supply useful constructors TNetworkAddress make_network_address(const std::string& hostname, int port); +Status get_inet_interfaces(std::vector* interfaces, bool include_ipv6 = false); + } #endif diff --git a/be/src/util/os_info.cpp b/be/src/util/os_info.cpp new file mode 100755 index 0000000000..437eff3dd7 --- /dev/null +++ b/be/src/util/os_info.cpp @@ -0,0 +1,87 @@ +// Modifications 
copyright (C) 2017, Baidu.com, Inc. +// Copyright 2017 The Apache Software Foundation + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "util/os_info.h" + +#include +#include +#include +#include +#include + +#include + +#include "common/names.h" + +namespace palo { + +bool OsInfo::initialized_ = false; +string OsInfo::os_version_ = "Unknown"; +clockid_t OsInfo::fast_clock_ = CLOCK_MONOTONIC; +std::string OsInfo::clock_name_ = + "Unknown clocksource, clockid_t defaulting to CLOCK_MONOTONIC"; + +// CLOCK_MONOTONIC_COARSE was added in Linux 2.6.32. For now we still want to support +// older kernels by falling back to CLOCK_MONOTONIC. +#ifdef CLOCK_MONOTONIC_COARSE +#define HAVE_CLOCK_MONOTONIC_COARSE true +#else +#define HAVE_CLOCK_MONOTONIC_COARSE false +#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC +#endif + +void OsInfo::Init() { + DCHECK(!initialized_); + // Read from /proc/version + ifstream version("/proc/version", ios::in); + if (version.good()) getline(version, os_version_); + if (version.is_open()) version.close(); + + // Read the current clocksource to see if CLOCK_MONOTONIC is known to be fast. "tsc" is + // fast, while "xen" is slow (40 times slower than "tsc" on EC2). 
If CLOCK_MONOTONIC is + // known to be slow, we use CLOCK_MONOTONIC_COARSE, which uses jiffies, with a + // resolution measured in milliseconds, rather than nanoseconds. + std::ifstream clocksource_file( + "/sys/devices/system/clocksource/clocksource0/current_clocksource"); + if (clocksource_file.good()) { + std::string clocksource; + clocksource_file >> clocksource; + clock_name_ = "clocksource: '" + clocksource + "', clockid_t: "; + if (HAVE_CLOCK_MONOTONIC_COARSE && clocksource != "tsc") { + clock_name_ += "CLOCK_MONOTONIC_COARSE"; + fast_clock_ = CLOCK_MONOTONIC_COARSE; + } else { + clock_name_ += "CLOCK_MONOTONIC"; + fast_clock_ = CLOCK_MONOTONIC; + } + } + + initialized_ = true; +} + +string OsInfo::DebugString() { + DCHECK(initialized_); + stringstream stream; + stream << "OS version: " << os_version_ << endl + << "Clock: " << clock_name_ << endl; + return stream.str(); +} + +} diff --git a/be/src/util/os_info.h b/be/src/util/os_info.h new file mode 100755 index 0000000000..fdfc7f4563 --- /dev/null +++ b/be/src/util/os_info.h @@ -0,0 +1,60 @@ +// Modifications copyright (C) 2017, Baidu.com, Inc. +// Copyright 2017 The Apache Software Foundation + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef BDG_PALO_BE_UTIL_OS_INFO_H +#define BDG_PALO_BE_UTIL_OS_INFO_H + +#include + +#include + +#include "common/logging.h" + +namespace palo { + +/// Provides information about the OS we're running on. +class OsInfo { + public: + /// Initialize OsInfo. + static void Init(); + + static const std::string os_version() { + DCHECK(initialized_); + return os_version_; + } + + /// Return CLOCK_MONOTONIC if it's fast. Otherwise CLOCK_MONOTONIC_COARSE, which will be + /// fast but lower resolution. + static clockid_t fast_clock() { + DCHECK(initialized_); + return fast_clock_; + } + + static std::string DebugString(); + + private: + static bool initialized_; + static std::string os_version_; + static clockid_t fast_clock_; + static std::string clock_name_; +}; + +} +#endif diff --git a/be/src/util/palo_metrics.cpp b/be/src/util/palo_metrics.cpp index 8d34c02280..6aa31344f4 100644 --- a/be/src/util/palo_metrics.cpp +++ b/be/src/util/palo_metrics.cpp @@ -1,13 +1,8 @@ -// Modifications copyright (C) 2017, Baidu.com, Inc. -// Copyright 2017 The Apache Software Foundation +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // @@ -21,138 +16,113 @@ #include "util/palo_metrics.h" #include "util/debug_util.h" +#include "util/system_metrics.h" namespace palo { -// Naming convention: Components should be separated by '.' 
and words should -// be separated by '-'. -const char* PALO_BE_START_TIME = "palo_be.start_time"; -const char* PALO_BE_VERSION = "palo_be.version"; -const char* PALO_BE_READY = "palo_be.ready"; -const char* PALO_BE_NUM_FRAGMENTS = "palo_be.num_fragments"; -const char* TOTAL_SCAN_RANGES_PROCESSED = "palo_be.scan_ranges.total"; -const char* NUM_SCAN_RANGES_MISSING_VOLUME_ID = "palo_be.scan_ranges.num_missing_volume_id"; -const char* MEM_POOL_TOTAL_BYTES = "palo_be.mem_pool.total_bytes"; -const char* HASH_TABLE_TOTAL_BYTES = "palo_be.hash_table.total_bytes"; -const char* OLAP_LRU_CACHE_LOOKUP_COUNT = "palo_be.olap.lru_cache.lookup_count"; -const char* OLAP_LRU_CACHE_HIT_COUNT = "palo_be.olap.lru_cache.hit_count"; -const char* PALO_PUSH_COUNT = "palo_be.olap.push_count"; -const char* PALO_FETCH_COUNT = "palo_be.olap.fetch_count"; -const char* PALO_REQUEST_COUNT = "palo_be.olap.request_count"; -const char* BE_MERGE_DELTA_NUM = "palo_be.olap.be_merge.delta_num"; -const char* BE_MERGE_SIZE = "palo_be.olap.be_merge_size"; -const char* CE_MERGE_DELTA_NUM = "palo_be.olap.ce_merge.delta_num"; -const char* CE_MERGE_SIZE = "palo_be.olap.ce_merge_size"; +PaloMetrics PaloMetrics::_s_palo_metrics; -const char* IO_MGR_NUM_BUFFERS = "palo_be.io_mgr.num_buffers"; -const char* IO_MGR_NUM_OPEN_FILES = "palo_be.io_mgr.num_open_files"; -const char* IO_MGR_NUM_UNUSED_BUFFERS = "palo_be.io_mgr.num_unused_buffers"; -// const char* IO_MGR_NUM_CACHED_FILE_HANDLES = "palo_be.io_mgr_num_cached_file_handles"; -const char* IO_MGR_NUM_FILE_HANDLES_OUTSTANDING = "palo_be.io_mgr.num_file_handles_outstanding"; -// const char* IO_MGR_CACHED_FILE_HANDLES_HIT_COUNT = "palo_be.io_mgr_cached_file_handles_hit_count"; -// const char* IO_MGR_CACHED_FILE_HANDLES_MISS_COUNT = "palo_be.io_mgr_cached_file_handles_miss_count"; -const char* IO_MGR_TOTAL_BYTES = "palo_be.io_mgr.total_bytes"; +// counters +IntCounter PaloMetrics::fragment_requests_total; +IntCounter PaloMetrics::fragment_request_duration_us; 
+IntCounter PaloMetrics::http_requests_total; +IntCounter PaloMetrics::http_request_duration_us; +IntCounter PaloMetrics::http_request_send_bytes; +IntCounter PaloMetrics::query_scan_bytes; +IntCounter PaloMetrics::query_scan_rows; +IntCounter PaloMetrics::ranges_processed_total; +IntCounter PaloMetrics::push_requests_success_total; +IntCounter PaloMetrics::push_requests_fail_total; +IntCounter PaloMetrics::push_request_duration_us; +IntCounter PaloMetrics::push_request_write_bytes; +IntCounter PaloMetrics::push_request_write_rows; +IntCounter PaloMetrics::create_tablet_requests_total; +IntCounter PaloMetrics::drop_tablet_requests_total; +IntCounter PaloMetrics::report_all_tablets_requests_total; +IntCounter PaloMetrics::report_tablet_requests_total; +IntCounter PaloMetrics::schema_change_requests_total; +IntCounter PaloMetrics::create_rollup_requests_total; +IntCounter PaloMetrics::storage_migrate_requests_total; +IntCounter PaloMetrics::delete_requests_total; +IntCounter PaloMetrics::cancel_delete_requests_total; +IntCounter PaloMetrics::base_compaction_deltas_total; +IntCounter PaloMetrics::base_compaction_bytes_total; +IntCounter PaloMetrics::cumulative_compaction_deltas_total; +IntCounter PaloMetrics::cumulative_compaction_bytes_total; -// const char* IO_MGR_BYTES_READ = "palo_be.io_mgr_bytes_read"; -// const char* IO_MGR_LOCAL_BYTES_READ = "palo_be.io_mgr_local_bytes_read"; -// const char* IO_MGR_CACHED_BYTES_READ = "palo_be.io_mgr_cached_bytes_read"; -// const char* IO_MGR_SHORT_CIRCUIT_BYTES_READ = "palo_be.io_mgr_short_circuit_bytes_read"; -const char* IO_MGR_BYTES_WRITTEN = "palo_be.io_mgr.bytes_written"; +// gauges +IntGauge PaloMetrics::memory_pool_bytes_total; -const char* NUM_QUERIES_SPILLED = "palo_be.num_queries_spilled"; +PaloMetrics::PaloMetrics() : _metrics(nullptr), _system_metrics(nullptr) { +} +PaloMetrics::~PaloMetrics() { + delete _system_metrics; + delete _metrics; +} -// These are created by palo_be during startup. 
-StringProperty* PaloMetrics::_s_palo_be_start_time = NULL; -StringProperty* PaloMetrics::_s_palo_be_version = NULL; -BooleanProperty* PaloMetrics::_s_palo_be_ready = NULL; -IntCounter* PaloMetrics::_s_palo_be_num_fragments = NULL; -IntCounter* PaloMetrics::_s_num_ranges_processed = NULL; -IntCounter* PaloMetrics::_s_num_ranges_missing_volume_id = NULL; -IntGauge* PaloMetrics::_s_mem_pool_total_bytes = NULL; -IntGauge* PaloMetrics::_s_hash_table_total_bytes = NULL; -IntCounter* PaloMetrics::_s_olap_lru_cache_lookup_count = NULL; -IntCounter* PaloMetrics::_s_olap_lru_cache_hit_count = NULL; -IntCounter* PaloMetrics::_s_palo_push_count = NULL; -IntCounter* PaloMetrics::_s_palo_fetch_count = NULL; -IntCounter* PaloMetrics::_s_palo_request_count = NULL; -IntCounter* PaloMetrics::_s_be_merge_delta_num = NULL; -IntCounter* PaloMetrics::_s_be_merge_size = NULL; -IntCounter* PaloMetrics::_s_ce_merge_delta_num = NULL; -IntCounter* PaloMetrics::_s_ce_merge_size = NULL; +void PaloMetrics::initialize(const std::string& name, + bool init_system_metrics, + const std::set& disk_devices, + const std::vector& network_interfaces) { + _metrics = new MetricRegistry(name); +#define REGISTER_PALO_METRIC(name) _metrics->register_metric(#name, &name) -IntGauge* PaloMetrics::_s_io_mgr_num_buffers = NULL; -IntGauge* PaloMetrics::_s_io_mgr_num_open_files = NULL; -IntGauge* PaloMetrics::_s_io_mgr_num_unused_buffers = NULL; -// IntGauge* PaloMetrics::_s_io_mgr_num_cached_file_handles = NULL; -IntGauge* PaloMetrics::_s_io_mgr_num_file_handles_outstanding = NULL; -// IntGauge* PaloMetrics::_s_io_mgr_cached_file_handles_hit_count = NULL; -// IntGauge* PaloMetrics::_s_io_mgr_cached_file_handles_miss_count = NULL; -IntGauge* PaloMetrics::_s_io_mgr_total_bytes = NULL; + // You can put PaloMetrics's metrics initial code here + REGISTER_PALO_METRIC(fragment_requests_total); + REGISTER_PALO_METRIC(fragment_request_duration_us); + REGISTER_PALO_METRIC(http_requests_total); + 
REGISTER_PALO_METRIC(http_request_duration_us); + REGISTER_PALO_METRIC(http_request_send_bytes); + REGISTER_PALO_METRIC(query_scan_bytes); + REGISTER_PALO_METRIC(query_scan_rows); + REGISTER_PALO_METRIC(ranges_processed_total); -// IntGauge* PaloMetrics::_s_io_mgr_bytes_read = NULL; -// IntGauge* PaloMetrics::_s_io_mgr_local_bytes_read = NULL; -// IntGauge* PaloMetrics::_s_io_mgr_cached_bytes_read = NULL; -// IntGauge* PaloMetrics::_s_io_mgr_short_circuit_bytes_read = NULL; -IntCounter* PaloMetrics::_s_io_mgr_bytes_written = NULL; + // push request + _metrics->register_metric( + "push_requests_total", MetricLabels().add("status", "SUCCESS"), + &push_requests_success_total); + _metrics->register_metric( + "push_requests_total", MetricLabels().add("status", "FAIL"), + &push_requests_fail_total); + REGISTER_PALO_METRIC(push_request_duration_us); + REGISTER_PALO_METRIC(push_request_write_bytes); + REGISTER_PALO_METRIC(push_request_write_rows); -IntCounter* PaloMetrics::_s_num_queries_spilled = NULL; +#define REGISTER_ENGINE_REQUEST_METRIC(type, metric) \ + _metrics->register_metric( \ + "engine_requests_total", MetricLabels().add("type", #type), &metric) -void PaloMetrics::create_metrics(MetricGroup* m) { - // Initialize impalad metrics - _s_palo_be_start_time = m->AddProperty( - PALO_BE_START_TIME, ""); - _s_palo_be_version = m->AddProperty( - PALO_BE_VERSION, get_version_string(true)); - _s_palo_be_ready = m->AddProperty(PALO_BE_READY, false); - _s_palo_be_num_fragments = m->AddCounter(PALO_BE_NUM_FRAGMENTS, 0L); + REGISTER_ENGINE_REQUEST_METRIC(create_tablet, create_tablet_requests_total); + REGISTER_ENGINE_REQUEST_METRIC(drop_tablet, drop_tablet_requests_total); + REGISTER_ENGINE_REQUEST_METRIC(report_all_tablets, report_all_tablets_requests_total); + REGISTER_ENGINE_REQUEST_METRIC(report_tablet, report_tablet_requests_total); + REGISTER_ENGINE_REQUEST_METRIC(schema_change, schema_change_requests_total); + REGISTER_ENGINE_REQUEST_METRIC(create_rollup, 
create_rollup_requests_total); + REGISTER_ENGINE_REQUEST_METRIC(storage_migrate, storage_migrate_requests_total); + REGISTER_ENGINE_REQUEST_METRIC(delete, delete_requests_total); + REGISTER_ENGINE_REQUEST_METRIC(cancel_delete, cancel_delete_requests_total); - // Initialize scan node metrics - _s_num_ranges_processed = m->AddCounter(TOTAL_SCAN_RANGES_PROCESSED, 0L); - _s_num_ranges_missing_volume_id = m->AddCounter(NUM_SCAN_RANGES_MISSING_VOLUME_ID, 0L); + _metrics->register_metric( + "compaction_deltas_total", MetricLabels().add("type", "base"), + &base_compaction_deltas_total); + _metrics->register_metric( + "compaction_deltas_total", MetricLabels().add("type", "cumulative"), + &cumulative_compaction_deltas_total); + _metrics->register_metric( + "compaction_bytes_total", MetricLabels().add("type", "base"), + &base_compaction_bytes_total); + _metrics->register_metric( + "compaction_bytes_total", MetricLabels().add("type", "cumulative"), + &cumulative_compaction_bytes_total); - // Initialize memory usage metrics - _s_mem_pool_total_bytes = m->AddGauge(MEM_POOL_TOTAL_BYTES, 0L); - _s_hash_table_total_bytes = m->AddGauge(HASH_TABLE_TOTAL_BYTES, 0L); + // Gauge + REGISTER_PALO_METRIC(memory_pool_bytes_total); - // Initialize olap metrics - _s_olap_lru_cache_lookup_count = m->AddCounter(OLAP_LRU_CACHE_LOOKUP_COUNT, 0L); - _s_olap_lru_cache_hit_count = m->AddCounter(OLAP_LRU_CACHE_HIT_COUNT, 0L); - - // Initialize push_count, fetch_count, request_count metrics - _s_palo_push_count = m->AddCounter(PALO_PUSH_COUNT, 0L); - _s_palo_fetch_count = m->AddCounter(PALO_FETCH_COUNT, 0L); - _s_palo_request_count = m->AddCounter(PALO_REQUEST_COUNT, 0L); - - // Initialize be/ce merge metrics - _s_be_merge_delta_num = m->AddCounter(BE_MERGE_DELTA_NUM, 0L); - _s_be_merge_size = m->AddCounter(BE_MERGE_SIZE, 0L); - _s_ce_merge_delta_num = m->AddCounter(CE_MERGE_DELTA_NUM, 0L); - _s_ce_merge_size = m->AddCounter(CE_MERGE_SIZE, 0L); - - // Initialize metrics relate to spilling to disk - // 
_s_io_mgr_bytes_read - // = m->AddGauge(IO_MGR_BYTES_READ, 0L); - // _s_io_mgr_local_bytes_read - // = m->AddGauge(IO_MGR_LOCAL_BYTES_READ, 0L); - // _s_io_mgr_cached_bytes_read - // = m->AddGauge(IO_MGR_CACHED_BYTES_READ, 0L); - // _s_io_mgr_short_circuit_bytes_read - // = m->AddGauge(IO_MGR_SHORT_CIRCUIT_BYTES_READ, 0L); - _s_io_mgr_bytes_written = m->AddCounter(IO_MGR_BYTES_WRITTEN, 0L); - - _s_io_mgr_num_buffers - = m->AddGauge(IO_MGR_NUM_BUFFERS, 0L); - _s_io_mgr_num_open_files - = m->AddGauge(IO_MGR_NUM_OPEN_FILES, 0L); - _s_io_mgr_num_unused_buffers - = m->AddGauge(IO_MGR_NUM_UNUSED_BUFFERS, 0L); - _s_io_mgr_num_file_handles_outstanding - = m->AddGauge(IO_MGR_NUM_FILE_HANDLES_OUTSTANDING, 0L); - _s_io_mgr_total_bytes - = m->AddGauge(IO_MGR_TOTAL_BYTES, 0L); - - _s_num_queries_spilled = m->AddCounter(NUM_QUERIES_SPILLED, 0L); + if (init_system_metrics) { + _system_metrics = new SystemMetrics(); + _system_metrics->install(_metrics, disk_devices, network_interfaces); + } } } diff --git a/be/src/util/palo_metrics.h b/be/src/util/palo_metrics.h index ccbece8591..6b02ea8a4a 100644 --- a/be/src/util/palo_metrics.h +++ b/be/src/util/palo_metrics.h @@ -1,13 +1,8 @@ -// Modifications copyright (C) 2017, Baidu.com, Inc. -// Copyright 2017 The Apache Software Foundation +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // @@ -21,143 +16,68 @@ #ifndef BDG_PALO_BE_SRC_COMMON_UTIL_PALO_METRICS_H #define BDG_PALO_BE_SRC_COMMON_UTIL_PALO_METRICS_H +#include +#include +#include + #include "util/metrics.h" namespace palo { -// Global impalad-wide metrics. This is useful for objects that want to update metrics -// without having to do frequent metrics lookups. -// These get created by impala-server from the Metrics ob& disk_devices = std::set(), + const std::vector& network_interfaces = std::vector()); + static PaloMetrics* instance() { return &_s_palo_metrics; } + static MetricRegistry* metrics() { return _s_palo_metrics._metrics; } private: - static StringProperty* _s_palo_be_start_time; - static StringProperty* _s_palo_be_version; - static BooleanProperty* _s_palo_be_ready; - static IntCounter* _s_palo_be_num_fragments; - static IntCounter* _s_num_ranges_processed; - static IntCounter* _s_num_ranges_missing_volume_id; - static IntGauge* _s_mem_pool_total_bytes; - static IntGauge* _s_hash_table_total_bytes; - static IntCounter* _s_olap_lru_cache_lookup_count; - static IntCounter* _s_olap_lru_cache_hit_count; - static IntCounter* _s_palo_push_count; - static IntCounter* _s_palo_fetch_count; - static IntCounter* _s_palo_request_count; - static IntCounter* _s_be_merge_delta_num; - static IntCounter* _s_be_merge_size; - static IntCounter* _s_ce_merge_delta_num; - static IntCounter* _s_ce_merge_size; - - // static IntGauge* _s_io_mgr_bytes_read; - // static IntGauge* _s_io_mgr_local_bytes_read; - // static IntGauge* _s_io_mgr_cached_bytes_read; - // static IntGauge* _s_io_mgr_short_circuit_bytes_read; - static IntCounter* _s_io_mgr_bytes_written; - - static IntGauge* _s_io_mgr_num_buffers; - static IntGauge* _s_io_mgr_num_open_files; - static IntGauge* _s_io_mgr_num_unused_buffers; - // static IntGauge* _s_io_mgr_num_cached_file_handles; - static IntGauge* _s_io_mgr_num_file_handles_outstanding; - 
// static IntGauge* _s_io_mgr_cached_file_handles_hit_count; - // static IntGauge* _s_io_mgr_cached_file_handles_miss_count; - static IntGauge* _s_io_mgr_total_bytes; - - static IntCounter* _s_num_queries_spilled; + // Don't allow constrctor + PaloMetrics(); +private: + static PaloMetrics _s_palo_metrics; + MetricRegistry* _metrics; + SystemMetrics* _system_metrics; }; }; diff --git a/be/src/util/rpc_channel.cpp b/be/src/util/rpc_channel.cpp new file mode 100644 index 0000000000..fbc17187a2 --- /dev/null +++ b/be/src/util/rpc_channel.cpp @@ -0,0 +1,175 @@ +// Copyright (c) 2018, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "util/rpc_channel.h" + +#include "rpc/error.h" +#include "rpc/protocol.h" +#include "rpc/serialization.h" + +namespace palo { + +RpcChannel::RpcChannel(Comm* comm, ConnectionManagerPtr conn_mgr, uint64_t command) + : _comm(comm), _conn_mgr(conn_mgr), _command(command) { +} + +RpcChannel::~RpcChannel() { +} + +Status RpcChannel::init(const std::string& host, int port, + uint32_t connect_timeout_ms, + uint32_t rpc_timeout_ms) { + // Initialize InetAddr + struct sockaddr_in sockaddr_in; + if (!InetAddr::initialize(&sockaddr_in, host.c_str(), port)) { + std::stringstream ss; + ss << "invalid inet address: host=" << host << ", port=" << port; + return Status(ss.str()); + } + _addr.set_inet(sockaddr_in); + + // connect to remote servcie + _connect_timeout_ms = connect_timeout_ms; + _conn_mgr->add(_addr, _connect_timeout_ms, "PaloBeDataStreamMgr"); + + _rpc_timeout_ms = rpc_timeout_ms; + return Status::OK; +} + +Status RpcChannel::get_response(const uint8_t** data, uint32_t* size) { + RETURN_IF_ERROR(_wait_for_last_sent()); + DCHECK(_last_event != nullptr); + *data = _last_event->payload; + *size = _last_event->payload_len; + return Status::OK; +} + +Status RpcChannel::send_message(const uint8_t* data, uint32_t size) { + // make sure + RETURN_IF_ERROR(_wait_for_last_sent()); + return _send_message(data, size); +} + +Status RpcChannel::_send_message(const uint8_t* data, uint32_t size) { + DCHECK(!_rpc_in_flight); + + CommHeader header(_command); + CommBufPtr new_comm_buf = std::make_shared(header, size); + new_comm_buf->append_bytes(data, size); + + auto res = _comm->send_request(_addr, _rpc_timeout_ms, new_comm_buf, this); + if (res != error::OK) { + LOG(WARNING) << "fail to send_request, addr=" << _addr.to_str() + << ", res=" << res << ", message=" << error::get_text(res); + // sleep 10ms to wait ConnectionManager to be notify + usleep(10 * 1000); + _conn_mgr->add(_addr, _connect_timeout_ms, "PaloBeDataStreamMgr"); + bool is_connected = 
_conn_mgr->wait_for_connection(_addr, _connect_timeout_ms); + if (!is_connected) { + LOG(WARNING) << "fail to wait_for_connection, addr=" << _addr.to_str(); + _conn_mgr->remove(_addr); + _rpc_status = Status(TStatusCode::THRIFT_RPC_ERROR, "connection to remote PaloBe failed"); + return _rpc_status; + } + res = _comm->send_request(_addr, _rpc_timeout_ms, new_comm_buf, this); + if (res != error::OK) { + LOG(WARNING) << "fail to send_request, addr=" << _addr.to_str() + << ", res=" << res << ", message=" << error::get_text(res); + _rpc_status = Status(TStatusCode::THRIFT_RPC_ERROR, "fail to send_request"); + return _rpc_status; + } + } + _cbp = new_comm_buf; + _rpc_in_flight = true; + return Status::OK; +} + +Status RpcChannel::_wait_for_last_sent() { + if (!_rpc_in_flight) { + return _rpc_status; + } + int retry_times = 1; + while (true) { + EventPtr event; + { + std::unique_lock l(_lock); + auto duration = std::chrono::milliseconds(2 * _rpc_timeout_ms); + if (_cond.wait_for(l, duration, [this]() { return !this->_events.empty(); })) { + event = _events.front(); + _events.pop_front(); + } + } + if (event == nullptr) { + LOG(WARNING) << "it's so weird, wait reponse event timeout, request=" + << _cbp->header.id << ", addr=" << _addr.to_str(); + _rpc_in_flight = false; + if (retry_times-- > 0) { + // timeout to receive response, to get user data to resend + const uint8_t* data = nullptr; + uint32_t size = 0; + _cbp->get_user_data(&data, &size); + RETURN_IF_ERROR(_send_message(data, size)); + } else { + LOG(WARNING) << "fail to send batch, _add=" << _addr.to_str() + << ", request_id="<< _cbp->header.id; + _rpc_status = Status(TStatusCode::THRIFT_RPC_ERROR, "fail to send batch"); + break; + } + continue; + } + if (event->type == Event::MESSAGE) { + if (event->header.id != _cbp->header.id) { + LOG(WARNING) << "receive event id not equal with in-flight request, request_id=" + << _cbp->header.id << ", event=" << event->to_str(); + continue; + } + // response recept + 
_rpc_in_flight = false; + _last_event = event; + return Status::OK; + } else if (event->type == Event::DISCONNECT || event->type == Event::ERROR) { + if (event->header.id != 0 && event->header.id != _cbp->header.id) { + LOG(WARNING) << "receive event id not equal with in-flight request, request_id=" + << _cbp->header.id << ", event=" << event->to_str(); + continue; + } + LOG(WARNING) << "receive response failed, request_id=" << _cbp->header.id + << ", event=" << event->to_str(); + _rpc_in_flight = false; + // error happend when receving response, we need to retry last request + if (retry_times-- > 0) { + // timeout to receive response + const uint8_t* data = nullptr; + uint32_t size = 0; + _cbp->get_user_data(&data, &size); + RETURN_IF_ERROR(_send_message(data, size)); + } else { + LOG(WARNING) << "fail to send batch, request_id="<< _cbp->header.id + << ", event=" << event->to_str(); + _rpc_status = Status(TStatusCode::THRIFT_RPC_ERROR, "fail to send batch"); + break; + } + } else { + _rpc_in_flight = false; + LOG(ERROR) << "recevie unexpect event, event=" << event->to_str(); + _rpc_status = Status(TStatusCode::THRIFT_RPC_ERROR, "fail to send batch"); + break; + } + } + + return _rpc_status; +} + +} diff --git a/be/src/util/rpc_channel.h b/be/src/util/rpc_channel.h new file mode 100644 index 0000000000..66bf892a95 --- /dev/null +++ b/be/src/util/rpc_channel.h @@ -0,0 +1,91 @@ +// Copyright (c) 2018, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include +#include +#include +#include + +#include "common/status.h" +#include "rpc/comm.h" +#include "rpc/comm_address.h" +#include "rpc/comm_buf.h" +#include "rpc/connection_manager.h" +#include "rpc/dispatch_handler.h" +#include "rpc/event.h" + +namespace palo { + +// this class is NOT thread safe. +class RpcChannel : public DispatchHandler { +public: + RpcChannel(Comm* comm, ConnectionManagerPtr conn_mgr, uint64_t command); + virtual ~RpcChannel(); + + Status init(const std::string& host, int port, + uint32_t connect_timeout_ms, + uint32_t rpc_timeout_ms); + + // DispatchHandler handle, used to handle request event + void handle(EventPtr& event) override { + { + std::lock_guard l(_lock); + _events.push_back(event); + } + _cond.notify_one(); + } + + Status wait_last_sent() { + return _wait_for_last_sent(); + } + // get last send's response + Status get_response(const uint8_t** data, uint32_t* size); + + // make sure + Status send_message(const uint8_t* data, uint32_t size); + +private: + Status _wait_for_last_sent(); + Status _send_message(const uint8_t* data, uint32_t size); + +private: + Comm* _comm; + ConnectionManagerPtr _conn_mgr; + uint64_t _command; + + CommAddress _addr; + uint32_t _connect_timeout_ms = 500; + uint32_t _rpc_timeout_ms = 1000; + + bool _rpc_in_flight = false; + Status _rpc_status; + + // only valid when _rpc_in_flight is true + CommBufPtr _cbp; + EventPtr _last_event; + + // lock, protect variables + std::mutex _lock; + std::condition_variable _cond; + std::deque _events; +}; + +using RpcChannelPtr = std::shared_ptr; + +} diff --git a/be/src/util/runtime_profile.cpp b/be/src/util/runtime_profile.cpp index dce0f8de7d..97cd49d02b 100644 --- a/be/src/util/runtime_profile.cpp +++ b/be/src/util/runtime_profile.cpp @@ -327,9 +327,22 @@ void RuntimeProfile::compute_time_in_profile(int64_t total) { } } -void 
RuntimeProfile::add_child(RuntimeProfile* child, bool indent, RuntimeProfile* loc) { - DCHECK(child != NULL); - boost::lock_guard l(_children_lock); +RuntimeProfile* RuntimeProfile::create_child(const std::string& name, bool indent, + bool prepend) { + boost::lock_guard l(_children_lock); + DCHECK(_child_map.find(name) == _child_map.end()); + RuntimeProfile* child = _pool->add(new RuntimeProfile(_pool.get(), name)); + if (_children.empty()) { + add_child_unlock(child, indent, NULL); + } else { + ChildVector::iterator pos = prepend ? _children.begin() : _children.end(); + add_child_unlock(child, indent, (*pos).first); + } + return child; +} + +void RuntimeProfile::add_child_unlock(RuntimeProfile* child, bool indent, RuntimeProfile* loc) { + DCHECK(child != NULL); _child_map[child->_name] = child; if (loc == NULL) { @@ -339,13 +352,17 @@ void RuntimeProfile::add_child(RuntimeProfile* child, bool indent, RuntimeProfil if (it->first == loc) { _children.insert(++it, std::make_pair(child, indent)); return; - } - } - + } + } DCHECK(false) << "Invalid loc"; } } +void RuntimeProfile::add_child(RuntimeProfile* child, bool indent, RuntimeProfile* loc) { + boost::lock_guard l(_children_lock); + add_child_unlock(child, indent, loc); +} + void RuntimeProfile::get_children(std::vector* children) { children->clear(); boost::lock_guard l(_children_lock); diff --git a/be/src/util/runtime_profile.h b/be/src/util/runtime_profile.h index 59eb7abde6..9f5134fd11 100644 --- a/be/src/util/runtime_profile.h +++ b/be/src/util/runtime_profile.h @@ -52,6 +52,8 @@ namespace palo { (profile)->add_counter(name, TUnit::TIME_NS, parent) #define SCOPED_TIMER(c) \ ScopedTimer MACRO_CONCAT(SCOPED_TIMER, __COUNTER__)(c) +#define SCOPED_RAW_TIMER(c) \ + ScopedRawTimer MACRO_CONCAT(SCOPED_RAW_TIMER, __COUNTER__)(c) #define COUNTER_UPDATE(c, v) (c)->update(v) #define COUNTER_SET(c, v) (c)->set(v) #define ADD_THREAD_COUNTERS(profile, prefix) (profile)->add_thread_counters(prefix) @@ -62,6 +64,7 @@ 
namespace palo { #define ADD_COUNTER(profile, name, type) NULL #define ADD_TIMER(profile, name) NULL #define SCOPED_TIMER(c) +#define SCOPED_RAW_TIMER(c) #define COUNTER_UPDATE(c, v) #define COUNTER_SET(c, v) #define ADD_THREADCOUNTERS(profile, prefix) NULL @@ -82,34 +85,18 @@ class RuntimeProfile { public: class Counter { public: - /* - Counter(TUnit::type type) : - _value(0L), - _type(type) { - } - Counter(TUnit::type type, int64_t value) : - _value(value), - _type(type) { - } - */ - Counter(TUnit::type type, int64_t value = 0) : _value(value), _type(type) { } virtual ~Counter() { } - void update(int64_t delta) { + virtual void update(int64_t delta) { //__sync_fetch_and_add(&_value, delta); _value.add(delta); } // Use this to update if the counter is a bitmap - //void bit_or(int64_t delta) { - // __sync_fetch_and_or(&_value, delta); - //} - - /// Use this to update if the counter is a bitmap void bit_or(int64_t delta) { int64_t old; do { @@ -118,13 +105,13 @@ public: } while (UNLIKELY(!_value.compare_and_swap(old, old | delta))); } - void set(int64_t value) { + virtual void set(int64_t value) { _value.store(value); } - void set(int value) { _value.store(value); } + virtual void set(int value) { _value.store(value); } - void set(double value) { + virtual void set(double value) { DCHECK_EQ(sizeof(value), sizeof(int64_t)); _value.store(*reinterpret_cast(&value)); } @@ -138,12 +125,6 @@ public: return *reinterpret_cast(&v); } - /* - TUnit::type type() const { - return _type; - } - */ - TUnit::type type() const { return _type; } @@ -151,7 +132,6 @@ public: private: friend class RuntimeProfile; - //int64_t _value; AtomicInt64 _value; TUnit::type _type; }; @@ -171,14 +151,14 @@ public: public: HighWaterMarkCounter(TUnit::type unit) : Counter(unit) {} - virtual void Add(int64_t delta) { + virtual void add(int64_t delta) { int64_t new_val = current_value_.add(delta); UpdateMax(new_val); } /// Tries to increase the current value by delta. 
If current_value() + delta /// exceeds max, return false and current_value is not changed. - bool TryAdd(int64_t delta, int64_t max) { + bool try_add(int64_t delta, int64_t max) { while (true) { int64_t old_val = current_value_.load(); int64_t new_val = old_val + delta; @@ -190,7 +170,7 @@ public: } } - virtual void Set(int64_t v) { + virtual void set(int64_t v) { current_value_.store(v); UpdateMax(v); } @@ -318,6 +298,14 @@ public: // already be added to the profile. void add_child(RuntimeProfile* child, bool indent, RuntimeProfile* location); + void add_child_unlock(RuntimeProfile* child, bool indent, RuntimeProfile* loc); + + /// Creates a new child profile with the given 'name'. A child profile with that name + /// must not already exist. If 'prepend' is true, prepended before other child profiles, + /// otherwise appended after other child profiles. + RuntimeProfile* create_child( + const std::string& name, bool indent = true, bool prepend = false); + // Sorts all children according to a custom comparator. Does not // invalidate pointers to profiles. template @@ -371,6 +359,11 @@ public: // in any of the child profiles to 'counters'. void get_counters(const std::string& name, std::vector* counters); + // Helper to append to the "ExecOption" info string. + void append_exec_option(const std::string& option) { + add_info_string("ExecOption", option); + } + // Adds a string to the runtime profile. If a value already exists for 'key', // the value will be updated. void add_info_string(const std::string& key, const std::string& value); @@ -723,6 +716,28 @@ private: RuntimeProfile::Counter* _counter; }; +// Utility class to update time elapsed when the object goes out of scope. +// 'T' must implement the stopWatch "interface" (start,stop,elapsed_time) but +// we use templates not to pay for virtual function overhead. 
+template +class ScopedRawTimer { +public: + ScopedRawTimer(int64_t* counter) : _counter(counter) { + _sw.start(); + } + // Update counter when object is destroyed + ~ScopedRawTimer() { + *_counter += _sw.elapsed_time(); + } +private: + // Disable copy constructor and assignment + ScopedRawTimer(const ScopedRawTimer& timer); + ScopedRawTimer& operator=(const ScopedRawTimer& timer); + + T _sw; + int64_t* _counter; +}; + } #endif diff --git a/be/src/util/system_metrics.cpp b/be/src/util/system_metrics.cpp new file mode 100644 index 0000000000..15ccffbeac --- /dev/null +++ b/be/src/util/system_metrics.cpp @@ -0,0 +1,387 @@ +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "util/system_metrics.h" + +#include +#include + +#include + +namespace palo { + +const char* SystemMetrics::_s_hook_name = "system_metrics"; + +// /proc/stat: http://www.linuxhowtos.org/System/procstat.htm +struct CpuMetrics { + static constexpr int k_num_metrics = 10; + static const char* k_names[k_num_metrics]; + IntLockCounter metrics[k_num_metrics]; +}; + +const char* CpuMetrics::k_names[] = { + "user", "nice", "system", "idle", "iowait", + "irq", "soft_irq", "steal", "guest", "guest_nice"}; + +struct MemoryMetrics { + IntGauge allocated_bytes; +}; + +struct DiskMetrics { + IntLockCounter reads_completed; + IntLockCounter bytes_read; + IntLockCounter read_time_ms; + IntLockCounter writes_completed; + IntLockCounter bytes_written; + IntLockCounter write_time_ms; + IntLockCounter io_time_ms; + IntLockCounter io_time_weigthed; +}; + +struct NetMetrics { + IntLockCounter receive_bytes; + IntLockCounter receive_packets; + IntLockCounter send_bytes; + IntLockCounter send_packets; +}; + +SystemMetrics::SystemMetrics() { +} + +SystemMetrics::~SystemMetrics() { + // we must deregister us from registry + if (_registry != nullptr) { + _registry->deregister_hook(_s_hook_name); + _registry = nullptr; + } + for (auto& it : _disk_metrics) { + delete it.second; + } + for (auto& it : _net_metrics) { + delete it.second; + } + if (_line_ptr != 0) { + free(_line_ptr); + } +} + +void SystemMetrics::install(MetricRegistry* registry, + const std::set& disk_devices, + const std::vector& network_interfaces) { + DCHECK(_registry == nullptr); + if (!registry->register_hook(_s_hook_name, std::bind(&SystemMetrics::update, this))) { + return; + } + _install_cpu_metrics(registry); + _install_memory_metrics(registry); + _install_disk_metrics(registry, disk_devices); + _install_net_metrics(registry, network_interfaces); + _registry = registry; +} + +void SystemMetrics::update() { + _update_cpu_metrics(); + _update_memory_metrics(); + _update_disk_metrics(); + _update_net_metrics(); 
+} + +void SystemMetrics::_install_cpu_metrics(MetricRegistry* registry) { + _cpu_total.reset(new CpuMetrics()); + + for (int i = 0; i < CpuMetrics::k_num_metrics; ++i) { + registry->register_metric("cpu", + MetricLabels().add("mode", CpuMetrics::k_names[i]), + &_cpu_total->metrics[i]); + } +} + +#ifdef BE_TEST +const char* k_ut_stat_path; +const char* k_ut_diskstats_path; +const char* k_ut_net_dev_path; +#endif + +void SystemMetrics::_update_cpu_metrics() { +#ifdef BE_TEST + FILE* fp = fopen(k_ut_stat_path, "r"); +#else + FILE* fp = fopen("/proc/stat", "r"); +#endif + if (fp == nullptr) { + char buf[64]; + LOG(WARNING) << "open /proc/stat failed, errno=" << errno + << ", message=" << strerror_r(errno, buf, 64); + return; + } + + if (getline(&_line_ptr, &_line_buf_size, fp) < 0) { + char buf[64]; + LOG(WARNING) << "geline failed, errno=" << errno + << ", message=" << strerror_r(errno, buf, 64); + fclose(fp); + return; + } + + char cpu[16]; + int64_t values[CpuMetrics::k_num_metrics]; + memset(values, 0, sizeof(values)); + sscanf(_line_ptr, "%15s" + " %" PRId64 " %" PRId64 " %" PRId64 + " %" PRId64 " %" PRId64 " %" PRId64 + " %" PRId64 " %" PRId64 " %" PRId64 + " %" PRId64, + cpu, + &values[0], &values[1], &values[2], + &values[3], &values[4], &values[5], + &values[6], &values[7], &values[8], + &values[9]); + + for (int i = 0; i < CpuMetrics::k_num_metrics; ++i) { + _cpu_total->metrics[i].set_value(values[i]); + } + + fclose(fp); +} + +void SystemMetrics::_install_memory_metrics(MetricRegistry* registry) { + _memory_metrics.reset(new MemoryMetrics()); + + registry->register_metric("memory_allocated_bytes", &_memory_metrics->allocated_bytes); +} + +void SystemMetrics::_update_memory_metrics() { +#if defined(ADDRESS_SANITIZER) || defined(LEAK_SANITIZER) || defined(THREAD_SANITIZER) + LOG(INFO) << "Memory tracking is not available with address sanitizer builds."; +#else + size_t allocated_bytes = 0; + MallocExtension::instance()->GetNumericProperty( + 
"generic.current_allocated_bytes", &allocated_bytes); + _memory_metrics->allocated_bytes.set_value(allocated_bytes); +#endif +} + +void SystemMetrics::_install_disk_metrics(MetricRegistry* registry, + const std::set& devices) { + for (auto& disk : devices) { + DiskMetrics* metrics = new DiskMetrics(); +#define REGISTER_DISK_METRIC(name) \ + registry->register_metric("disk_"#name, \ + MetricLabels().add("device", disk), \ + &metrics->name) + REGISTER_DISK_METRIC(reads_completed); + REGISTER_DISK_METRIC(bytes_read); + REGISTER_DISK_METRIC(read_time_ms); + REGISTER_DISK_METRIC(writes_completed); + REGISTER_DISK_METRIC(bytes_written); + REGISTER_DISK_METRIC(write_time_ms); + REGISTER_DISK_METRIC(io_time_ms); + REGISTER_DISK_METRIC(io_time_weigthed); + _disk_metrics.emplace(disk, metrics); + } +} + +void SystemMetrics::_update_disk_metrics() { +#ifdef BE_TEST + FILE* fp = fopen(k_ut_diskstats_path, "r"); +#else + FILE* fp = fopen("/proc/diskstats", "r"); +#endif + if (fp == nullptr) { + char buf[64]; + LOG(WARNING) << "open /proc/diskstats failed, errno=" << errno + << ", message=" << strerror_r(errno, buf, 64); + return; + } + + // /proc/diskstats: https://www.kernel.org/doc/Documentation/ABI/testing/procfs-diskstats + // 1 - major number + // 2 - minor mumber + // 3 - device name + // 4 - reads completed successfully + // 5 - reads merged + // 6 - sectors read + // 7 - time spent reading (ms) + // 8 - writes completed + // 9 - writes merged + // 10 - sectors written + // 11 - time spent writing (ms) + // 12 - I/Os currently in progress + // 13 - time spent doing I/Os (ms) + // 14 - weighted time spent doing I/Os (ms) + // I think 1024 is enougth for device name + int major = 0; + int minor = 0; + char device[1024]; + int64_t values[11]; + while (getline(&_line_ptr, &_line_buf_size, fp) > 0) { + memset(values, 0, sizeof(values)); + int num = sscanf(_line_ptr, "%d %d %1023s" + " %" PRId64 " %" PRId64 " %" PRId64 + " %" PRId64 " %" PRId64 " %" PRId64 + " %" PRId64 " %" 
PRId64 " %" PRId64 + " %" PRId64 " %" PRId64, + &major, &minor, device, + &values[0], &values[1], &values[2], + &values[3], &values[4], &values[5], + &values[6], &values[7], &values[8], + &values[9], &values[10]); + if (num < 4) { + continue; + } + auto it = _disk_metrics.find(device); + if (it == std::end(_disk_metrics)) { + continue; + } + // update disk metrics + // reads_completed: 4 reads completed successfully + it->second->reads_completed.set_value(values[0]); + // bytes_read: 6 sectors read * 512; 5 reads merged is ignored + it->second->bytes_read.set_value(values[2] * 512); + // read_time_ms: 7 time spent reading (ms) + it->second->read_time_ms.set_value(values[3]); + // writes_completed: 8 writes completed + it->second->writes_completed.set_value(values[4]); + // bytes_written: 10 sectors write * 512; 9 writes merged is ignored + it->second->bytes_written.set_value(values[6] * 512); + // write_time_ms: 11 time spent writing (ms) + it->second->write_time_ms.set_value(values[7]); + // io_time_ms: 13 time spent doing I/Os (ms) + it->second->io_time_ms.set_value(values[9]); + // io_time_weigthed: 14 - weighted time spent doing I/Os (ms) + it->second->io_time_weigthed.set_value(values[10]); + } + if (ferror(fp) != 0) { + char buf[64]; + LOG(WARNING) << "getline failed, errno=" << errno + << ", message=" << strerror_r(errno, buf, 64); + return; + } + fclose(fp); +} + +void SystemMetrics::_install_net_metrics(MetricRegistry* registry, + const std::vector& interfaces) { + for (auto& net : interfaces) { + NetMetrics* metrics = new NetMetrics(); +#define REGISTER_NETWORK_METRIC(name) \ + registry->register_metric("network_"#name, \ + MetricLabels().add("device", net), \ + &metrics->name) + REGISTER_NETWORK_METRIC(receive_bytes); + REGISTER_NETWORK_METRIC(receive_packets); + REGISTER_NETWORK_METRIC(send_bytes); + REGISTER_NETWORK_METRIC(send_packets); + _net_metrics.emplace(net, metrics); + } +} + +void SystemMetrics::_update_net_metrics() { +#ifdef BE_TEST + // to 
mock proc + FILE* fp = fopen(k_ut_net_dev_path, "r"); +#else + FILE* fp = fopen("/proc/net/dev", "r"); +#endif + if (fp == nullptr) { + char buf[64]; + LOG(WARNING) << "open /proc/net/dev failed, errno=" << errno + << ", message=" << strerror_r(errno, buf, 64); + return; + } + + // Ignore header + if (getline(&_line_ptr, &_line_buf_size, fp) < 0 || + getline(&_line_ptr, &_line_buf_size, fp) < 0) { + char buf[64]; + LOG(WARNING) << "read /proc/net/dev first two line failed, errno=" << errno + << ", message=" << strerror_r(errno, buf, 64); + fclose(fp); + return; + } + if (_proc_net_dev_version == 0) { + if (strstr(_line_ptr, "compressed") != nullptr) { + _proc_net_dev_version = 3; + } else if (strstr(_line_ptr, "bytes") != nullptr) { + _proc_net_dev_version = 2; + } else { + _proc_net_dev_version = 1; + } + } + + while (getline(&_line_ptr, &_line_buf_size, fp) > 0) { + char* ptr = strrchr(_line_ptr, ':'); + if (ptr == nullptr) { + continue; + } + char *start = _line_ptr; + while (isspace(*start)) { + start++; + } + std::string interface(start, ptr - start); + auto it = _net_metrics.find(interface); + if (it == std::end(_net_metrics)) { + continue; + } + ptr++; + int64_t receive_bytes = 0; + int64_t receive_packets = 0; + int64_t send_bytes = 0; + int64_t send_packets = 0; + switch (_proc_net_dev_version) { + case 3: + // receive: bytes packets errs drop fifo frame compressed multicast + // send: bytes packets errs drop fifo colls carrier compressed + sscanf(ptr, + " %" PRId64 " %" PRId64 " %*d %*d %*d %*d %*d %*d" + " %" PRId64 " %" PRId64 " %*d %*d %*d %*d %*d %*d", + &receive_bytes, &receive_packets, + &send_bytes, &send_packets); + break; + case 2: + // receive: bytes packets errs drop fifo frame + // send: bytes packets errs drop fifo colls carrier + sscanf(ptr, + " %" PRId64 " %" PRId64 " %*d %*d %*d %*d" + " %" PRId64 " %" PRId64 " %*d %*d %*d %*d %*d", + &receive_bytes, &receive_packets, + &send_bytes, &send_packets); + break; + case 1: + // receive: packets 
errs drop fifo frame + // send: packets errs drop fifo colls carrier + sscanf(ptr, + " %" PRId64 " %*d %*d %*d %*d" + " %" PRId64 " %*d %*d %*d %*d %*d", + &receive_packets, &send_packets); + break; + default: + break; + } + it->second->receive_bytes.set_value(receive_bytes); + it->second->receive_packets.set_value(receive_packets); + it->second->send_bytes.set_value(send_bytes); + it->second->send_packets.set_value(send_packets); + } + if (ferror(fp) != 0) { + char buf[64]; + LOG(WARNING) << "getline failed, errno=" << errno + << ", message=" << strerror_r(errno, buf, 64); + return; + } + fclose(fp); +} + +} diff --git a/be/src/util/system_metrics.h b/be/src/util/system_metrics.h new file mode 100644 index 0000000000..67400f3a95 --- /dev/null +++ b/be/src/util/system_metrics.h @@ -0,0 +1,71 @@ +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "util/metrics.h" + +#include + +namespace palo { + +class CpuMetrics; +class MemoryMetrics; +class DiskMetrics; +class NetMetrics; + +class SystemMetrics { +public: + SystemMetrics(); + ~SystemMetrics(); + + // install system metrics to registry + void install(MetricRegistry* registry, + const std::set& disk_devices, + const std::vector& network_interfaces); + + // update metrics + void update(); + +private: + void _install_cpu_metrics(MetricRegistry*); + // On Intel(R) Xeon(R) CPU E5-2450 0 @ 2.10GHz; + // read /proc/stat would cost about 170us + void _update_cpu_metrics(); + + void _install_memory_metrics(MetricRegistry* registry); + void _update_memory_metrics(); + + void _install_disk_metrics(MetricRegistry* registry, + const std::set& devices); + void _update_disk_metrics(); + + void _install_net_metrics(MetricRegistry* registry, + const std::vector& interfaces); + void _update_net_metrics(); + +private: + static const char* _s_hook_name; + + std::unique_ptr _cpu_total; + std::unique_ptr _memory_metrics; + std::map _disk_metrics; + std::map _net_metrics; + int _proc_net_dev_version = 0; + + char* _line_ptr = nullptr; + size_t _line_buf_size = 0; + MetricRegistry* _registry = nullptr; +}; + +} diff --git a/be/src/util/thrift_server.cpp b/be/src/util/thrift_server.cpp index a218eea9db..0239d2959f 100644 --- a/be/src/util/thrift_server.cpp +++ b/be/src/util/thrift_server.cpp @@ -225,8 +225,8 @@ void* ThriftServer::ThriftServerEventProcessor::createContext( } if (_thrift_server->_metrics_enabled) { - _thrift_server->_num_current_connections_metric->increment(1L); - _thrift_server->_total_connections_metric->increment(1L); + _thrift_server->_connections_total->increment(1L); + _thrift_server->_current_connections->increment(1L); } // Store the _session_key in the per-client context to avoid recomputing @@ -257,7 +257,7 @@ void ThriftServer::ThriftServerEventProcessor::deleteContext( } if (_thrift_server->_metrics_enabled) { - 
_thrift_server->_num_current_connections_metric->increment(-1L); + _thrift_server->_current_connections->increment(-1L); } } @@ -265,7 +265,7 @@ ThriftServer::ThriftServer( const std::string& name, const boost::shared_ptr& processor, int port, - MetricGroup* metrics, + MetricRegistry* metrics, int num_worker_threads, ServerType server_type) : _started(false), @@ -279,13 +279,15 @@ ThriftServer::ThriftServer( _session_handler(NULL) { if (metrics != NULL) { _metrics_enabled = true; - std::stringstream count_ss; - count_ss << "palo_be.thrift_server." << name << ".connections_in_use"; - _num_current_connections_metric = - metrics->AddGauge(count_ss.str(), 0L); - std::stringstream max_ss; - max_ss << "palo_be.thrift_server." << name << ".total_connections"; - _total_connections_metric = metrics->AddCounter(max_ss.str(), 0L); + _current_connections.reset(new IntGauge()); + metrics->register_metric("thrift_current_connections", + MetricLabels().add("name", name), + _current_connections.get()); + + _connections_total.reset(new IntCounter()); + metrics->register_metric("thrift_connections_total", + MetricLabels().add("name", name), + _connections_total.get()); } else { _metrics_enabled = false; } diff --git a/be/src/util/thrift_server.h b/be/src/util/thrift_server.h index eebdc0be42..ea58cf179c 100644 --- a/be/src/util/thrift_server.h +++ b/be/src/util/thrift_server.h @@ -75,7 +75,7 @@ public: // - server_type: the type of IO strategy this server should employ ThriftServer(const std::string& name, const boost::shared_ptr& processor, int port, - MetricGroup* metrics = NULL, int num_worker_threads = DEFAULT_WORKER_THREADS, + MetricRegistry* metrics = NULL, int num_worker_threads = DEFAULT_WORKER_THREADS, ServerType server_type = THREADED); ~ThriftServer() { } @@ -147,10 +147,10 @@ private: bool _metrics_enabled; // Number of currently active connections - IntGauge* _num_current_connections_metric; + std::unique_ptr _current_connections; // Total connections made over the 
lifetime of this server - IntCounter* _total_connections_metric; + std::unique_ptr _connections_total; // Helper class which monitors starting servers. Needs access to internal members, and // is not used outside of this class. diff --git a/be/src/util/time.cpp b/be/src/util/time.cpp new file mode 100755 index 0000000000..7e47151266 --- /dev/null +++ b/be/src/util/time.cpp @@ -0,0 +1,139 @@ +// Modifications copyright (C) 2017, Baidu.com, Inc. +// Copyright 2017 The Apache Software Foundation + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include + +#include "util/time.h" + +using namespace palo; +using namespace std; + +void palo::SleepForMs(const int64_t duration_ms) { + this_thread::sleep_for(chrono::milliseconds(duration_ms)); +} + +// Convert the given time_point, 't', into a date-time string in the +// UTC time zone if 'utc' is true, or the local time zone if it is false. +// The returned string is of the form yyy-MM-dd HH::mm::SS. 
+static string TimepointToString(const chrono::system_clock::time_point& t, + bool utc) { + char buf[256]; + struct tm tmp; + auto input_time = chrono::system_clock::to_time_t(t); + + // gcc 4.9 does not support C++14 get_time and put_time functions, so we're + // stuck with strftime() for now. + if (utc) { + strftime(buf, sizeof(buf), "%F %T", gmtime_r(&input_time, &tmp)); + } else { + strftime(buf, sizeof(buf), "%F %T", localtime_r(&input_time, &tmp)); + } + return string(buf); +} + +// Format the sub-second part of the input time point object 't', at the +// precision specified by 'p'. The returned string is meant to be appended to +// the string returned by TimePointToString() above. +// Note the use of abs(). This is to make sure we correctly format negative times, +// i.e., times before the Unix epoch. +static string FormatSubSecond(const chrono::system_clock::time_point& t, + TimePrecision p) { + stringstream ss; + auto frac = t.time_since_epoch(); + if (p == TimePrecision::Millisecond) { + auto subsec = chrono::duration_cast(frac) % MILLIS_PER_SEC; + ss << "." << std::setfill('0') << std::setw(3) << abs(subsec.count()); + } else if (p == TimePrecision::Microsecond) { + auto subsec = chrono::duration_cast(frac) % MICROS_PER_SEC; + ss << "." << std::setfill('0') << std::setw(6) << abs(subsec.count()); + } else if (p == TimePrecision::Nanosecond) { + auto subsec = chrono::duration_cast(frac) % NANOS_PER_SEC; + ss << "." << std::setfill('0') << std::setw(9) << abs(subsec.count()); + } else { + // 1-second precision or unknown unit. Return empty string. + DCHECK_EQ(TimePrecision::Second, p); + ss << ""; + } + return ss.str(); +} + +// Convert time point 't' into date-time string at precision 'p'. +// Output string is in UTC time zone if 'utc' is true, else it is in the +// local time zone. 
+static string ToString(const chrono::system_clock::time_point& t, TimePrecision p, + bool utc) +{ + stringstream ss; + ss << TimepointToString(t, utc); + ss << FormatSubSecond(t, p); + return ss.str(); +} + +// Convenience function to convert Unix time, specified as seconds since +// the Unix epoch, into a C++ time_point object. +static chrono::system_clock::time_point TimepointFromUnix(int64_t s) { + return chrono::system_clock::time_point(chrono::seconds(s)); +} + +// Convenience function to convert Unix time, specified as milliseconds since +// the Unix epoch, into a C++ time_point object. +static chrono::system_clock::time_point TimepointFromUnixMillis(int64_t ms) { + return chrono::system_clock::time_point(chrono::milliseconds(ms)); +} + +// Convenience function to convert Unix time, specified as microseconds since +// the Unix epoch, into a C++ time_point object. +static chrono::system_clock::time_point TimepointFromUnixMicros(int64_t us) { + return chrono::system_clock::time_point(chrono::microseconds(us)); +} + +string palo::ToStringFromUnix(int64_t s, TimePrecision p) { + chrono::system_clock::time_point t = TimepointFromUnix(s); + return ToString(t, p, false); +} + +string palo::ToUtcStringFromUnix(int64_t s, TimePrecision p) { + chrono::system_clock::time_point t = TimepointFromUnix(s); + return ToString(t, p, true); +} + +string palo::ToStringFromUnixMillis(int64_t ms, TimePrecision p) { + chrono::system_clock::time_point t = TimepointFromUnixMillis(ms); + return ToString(t, p, false); +} + +string palo::ToUtcStringFromUnixMillis(int64_t ms, TimePrecision p) { + chrono::system_clock::time_point t = TimepointFromUnixMillis(ms); + return ToString(t, p, true); +} + +string palo::ToStringFromUnixMicros(int64_t us, TimePrecision p) { + chrono::system_clock::time_point t = TimepointFromUnixMicros(us); + return ToString(t, p, false); +} + +string palo::ToUtcStringFromUnixMicros(int64_t us, TimePrecision p) { + chrono::system_clock::time_point t = 
TimepointFromUnixMicros(us); + return ToString(t, p, true); +} diff --git a/be/src/util/time.h b/be/src/util/time.h new file mode 100755 index 0000000000..651f610dea --- /dev/null +++ b/be/src/util/time.h @@ -0,0 +1,110 @@ +// Modifications copyright (C) 2017, Baidu.com, Inc. +// Copyright 2017 The Apache Software Foundation + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef BDG_PALO_BE_UTIL_TIME_H +#define BDG_PALO_BE_UTIL_TIME_H + +#include +#include +#include + +#include "gutil/walltime.h" + +/// Utilities for collecting timings. +namespace palo { + +/// Returns a value representing a point in time that is unaffected by daylight savings or +/// manual adjustments to the system clock. This should not be assumed to be a Unix +/// time. Typically the value corresponds to elapsed time since the system booted. See +/// UnixMillis() below if you need to send a time to a different host. 
+inline int64_t MonotonicNanos() { + timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * NANOS_PER_SEC + ts.tv_nsec; +} + +inline int64_t MonotonicMicros() { // 63 bits ~= 5K years uptime + return GetMonoTimeMicros(); +} + +inline int64_t MonotonicMillis() { + return GetMonoTimeMicros() / MICROS_PER_MILLI; +} + +inline int64_t MonotonicSeconds() { + return GetMonoTimeMicros() / MICROS_PER_SEC; +} + +/// Returns the number of milliseconds that have passed since the Unix epoch. This is +/// affected by manual changes to the system clock but is more suitable for use across +/// a cluster. For more accurate timings on the local host use the monotonic functions +/// above. +inline int64_t UnixMillis() { + return GetCurrentTimeMicros() / MICROS_PER_MILLI; +} + +/// Returns the number of microseconds that have passed since the Unix epoch. This is +/// affected by manual changes to the system clock but is more suitable for use across +/// a cluster. For more accurate timings on the local host use the monotonic functions +/// above. +inline int64_t UnixMicros() { + return GetCurrentTimeMicros(); +} + +/// Sleeps the current thread for at least duration_ms milliseconds. +void SleepForMs(const int64_t duration_ms); + +// An enum class to use as precision argument for the ToString*() functions below +enum TimePrecision { + Second, + Millisecond, + Microsecond, + Nanosecond +}; + +/// Converts the input Unix time, 's', specified in seconds since the Unix epoch, to a +/// date-time string in the local time zone. The precision in the output date-time string +/// is specified by the second argument, 'p'. The returned string is of the format +/// yyyy-MM-dd HH:mm:SS[.ms[us[ns]]. It's worth noting that if the precision specified +/// by 'p' is higher than that of the input timestamp, the part corresponding to +/// 'p' in the fractional second part of the output will just be zero-padded. 
+std::string ToStringFromUnix(int64_t s, TimePrecision p = TimePrecision::Second); + +/// Converts input seconds-since-epoch to date-time string in UTC time zone. +std::string ToUtcStringFromUnix(int64_t s, TimePrecision p = TimePrecision::Second); + +/// Converts input milliseconds-since-epoch to date-time string in local time zone. +std::string ToStringFromUnixMillis(int64_t ms, + TimePrecision p = TimePrecision::Millisecond); + +/// Converts input milliseconds-since-epoch to date-time string in UTC time zone. +std::string ToUtcStringFromUnixMillis(int64_t ms, + TimePrecision p = TimePrecision::Millisecond); + +/// Converts input microseconds-since-epoch to date-time string in local time zone. +std::string ToStringFromUnixMicros(int64_t us, + TimePrecision p = TimePrecision::Microsecond); + +/// Converts input microseconds-since-epoch to date-time string in UTC time zone. +std::string ToUtcStringFromUnixMicros(int64_t us, + TimePrecision p = TimePrecision::Microsecond); + +} // namespace impala +#endif diff --git a/be/src/util/types.h b/be/src/util/types.h new file mode 100644 index 0000000000..3b76e3af3b --- /dev/null +++ b/be/src/util/types.h @@ -0,0 +1,34 @@ +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +namespace palo { + +// Because __int128 in memory is not aligned, but GCC7 will generate SSE instruction +// for __int128 load/store. This will cause segment fault. 
+struct PackedInt128 { + PackedInt128& operator=(const __int128& value_) { + value = value_; + return *this; + } + PackedInt128& operator=(const PackedInt128& rhs) { + value = rhs.value; + return *this; + } + __int128 value; +} __attribute__((packed)); + +} diff --git a/be/test/agent/mock_task_worker_pool.h b/be/test/agent/mock_task_worker_pool.h index 5585257669..004b825f98 100644 --- a/be/test/agent/mock_task_worker_pool.h +++ b/be/test/agent/mock_task_worker_pool.h @@ -22,7 +22,7 @@ namespace palo { const uint32_t TASK_FINISH_MAX_RETRY = 3; -const uint32_t PUSH_MAX_RETRY = 3; +const uint32_t PUSH_MAX_RETRY = 1; const uint32_t REPORT_TASK_WORKER_COUNT = 1; const uint32_t REPORT_DISK_STATE_WORKER_COUNT = 1; const uint32_t REPORT_OLAP_TABLE_WORKER_COUNT = 1; diff --git a/be/test/agent/task_worker_pool_test.cpp b/be/test/agent/task_worker_pool_test.cpp index 804efa5ccf..386e28de1d 100644 --- a/be/test/agent/task_worker_pool_test.cpp +++ b/be/test/agent/task_worker_pool_test.cpp @@ -1322,12 +1322,14 @@ TEST(TaskWorkerPoolTest, TestReportDiskState) { task_worker_pool._master_client = &mock_master_server_client; // Get root path failed, report failed +#if 0 EXPECT_CALL(mock_command_executor, get_all_root_path_stat(_)) .Times(1) .WillOnce(Return(OLAPStatus::OLAP_ERR_OTHER_ERROR)); EXPECT_CALL(mock_master_server_client, report(_, _)) .Times(0); task_worker_pool._report_disk_state_worker_thread_callback(&task_worker_pool); +#endif // Get root path success, report failed EXPECT_CALL(mock_command_executor, get_all_root_path_stat(_)) diff --git a/be/test/agent/utils_test.cpp b/be/test/agent/utils_test.cpp index db1645f8fe..b40620248f 100644 --- a/be/test/agent/utils_test.cpp +++ b/be/test/agent/utils_test.cpp @@ -42,8 +42,8 @@ int main(int argc, char **argv) { return -1; } - palo::BackendOptions::init(); palo::init_glog("be-test"); + palo::BackendOptions::init(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/be/test/http/CMakeLists.txt 
b/be/test/http/CMakeLists.txt new file mode 100644 index 0000000000..fb08d9f75c --- /dev/null +++ b/be/test/http/CMakeLists.txt @@ -0,0 +1,19 @@ +# Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# where to put generated libraries +set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/test/http") + +ADD_BE_TEST(metrics_action_test) diff --git a/be/test/http/metrics_action_test.cpp b/be/test/http/metrics_action_test.cpp new file mode 100644 index 0000000000..5c6b47eb5c --- /dev/null +++ b/be/test/http/metrics_action_test.cpp @@ -0,0 +1,101 @@ +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "http/action/metrics_action.h" + +#include + +#include "http/http_request.h" +#include "http/http_response.h" +#include "http/http_channel.h" +#include "util/metrics.h" + +namespace palo { + +// Mock part +const char* s_expect_response = nullptr; + +HttpChannel::HttpChannel(const HttpRequest& request, mg_connection* mg_conn) : + _request(request), + _mg_conn(mg_conn) { +} + +void HttpChannel::send_response(const HttpResponse& response) { + ASSERT_STREQ(s_expect_response, response.content()->c_str()); +} + +HttpRequest::HttpRequest(mg_connection* conn) { +} + +class MetricsActionTest : public testing::Test { +public: + MetricsActionTest() { } + virtual ~MetricsActionTest() { + } +}; + +TEST_F(MetricsActionTest, prometheus_output) { + MetricRegistry registry("test"); + IntGauge cpu_idle; + cpu_idle.set_value(50); + registry.register_metric("cpu_idle", &cpu_idle); + IntCounter put_requests_total; + put_requests_total.increment(2345); + registry.register_metric("requests_total", + MetricLabels().add("type", "put").add("path", "/sports"), + &put_requests_total); + s_expect_response = + "# TYPE test_cpu_idle GAUGE\n" + "test_cpu_idle 50\n" + "# TYPE test_requests_total COUNTER\n" + "test_requests_total{path=\"/sports\",type=\"put\"} 2345\n"; + HttpRequest request(nullptr); + HttpChannel channel(request, nullptr); + MetricsAction action(®istry); + action.handle(&request, &channel); +} + +TEST_F(MetricsActionTest, prometheus_no_prefix) { + MetricRegistry registry(""); + IntGauge cpu_idle; + cpu_idle.set_value(50); + registry.register_metric("cpu_idle", &cpu_idle); + s_expect_response = + "# TYPE cpu_idle GAUGE\n" + "cpu_idle 50\n"; + HttpRequest request(nullptr); + HttpChannel channel(request, nullptr); + MetricsAction action(®istry); + action.handle(&request, &channel); +} + +TEST_F(MetricsActionTest, prometheus_no_name) { + MetricRegistry registry("test"); + IntGauge cpu_idle; + cpu_idle.set_value(50); + registry.register_metric("", &cpu_idle); + 
s_expect_response = ""; + HttpRequest request(nullptr); + HttpChannel channel(request, nullptr); + MetricsAction action(®istry); + action.handle(&request, &channel); +} + +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/be/test/olap/CMakeLists.txt b/be/test/olap/CMakeLists.txt index 1d81c66342..af6a9bfc3d 100644 --- a/be/test/olap/CMakeLists.txt +++ b/be/test/olap/CMakeLists.txt @@ -4,19 +4,25 @@ set(LIBRARY_OUTPUT_PATH "${BUILD_DIR}/test/olap") # where to put generated binaries set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/test/olap") -#ADD_BE_TEST(row_block_test) +ADD_BE_TEST(row_block_test) ADD_BE_TEST(command_executor_test) -#ADD_BE_TEST(olap_reader_test) -#ADD_BE_TEST(vectorized_olap_reader_test) ADD_BE_TEST(bit_field_test) ADD_BE_TEST(byte_buffer_test) -ADD_BE_TEST(column_reader_test) ADD_BE_TEST(run_length_byte_test) ADD_BE_TEST(run_length_integer_test) ADD_BE_TEST(stream_index_test) ADD_BE_TEST(lru_cache_test) -ADD_BE_TEST(delete_handler_test) -ADD_BE_TEST(file_helper_test) -ADD_BE_TEST(file_utils_test) ADD_BE_TEST(bloom_filter_test) ADD_BE_TEST(bloom_filter_index_test) +ADD_BE_TEST(comparison_predicate_test) +ADD_BE_TEST(in_list_predicate_test) +ADD_BE_TEST(null_predicate_test) +ADD_BE_TEST(file_helper_test) +ADD_BE_TEST(file_utils_test) +ADD_BE_TEST(delete_handler_test) +ADD_BE_TEST(column_reader_test) +ADD_BE_TEST(row_cursor_test) + +## deleted +# ADD_BE_TEST(olap_reader_test) +# ADD_BE_TEST(vectorized_olap_reader_test) diff --git a/be/test/olap/bit_field_test.cpp b/be/test/olap/bit_field_test.cpp index 3c9ca674e5..c4c83de3c4 100755 --- a/be/test/olap/bit_field_test.cpp +++ b/be/test/olap/bit_field_test.cpp @@ -70,7 +70,8 @@ public: 0, _helper.length(), NULL, - OLAP_DEFAULT_COLUMN_STREAM_BUFFER_SIZE); + OLAP_DEFAULT_COLUMN_STREAM_BUFFER_SIZE, + &_stats); ASSERT_EQ(OLAP_SUCCESS, _stream->init()); _reader = new (std::nothrow) BitFieldReader(_stream); @@ -84,6 +85,7 @@ public: FileHandler 
_helper; ByteBuffer* _shared_buffer; ReadOnlyFileStream* _stream; + OlapReaderStatistics _stats; }; TEST_F(TestBitField, ReadWriteOneBit) { @@ -147,7 +149,7 @@ TEST_F(TestBitField, Seek) { PositionEntryReader entry; entry._positions = index_entry._positions; entry._positions_count = index_entry._positions_count; - entry._statistics.init(OLAP_FIELD_TYPE_NONE, false); + entry._statistics.init(OLAP_FIELD_TYPE_TINYINT, false); PositionProvider position(&entry); _reader->seek(&position); diff --git a/be/test/olap/bloom_filter_test.cpp b/be/test/olap/bloom_filter_test.cpp index 36df30eb69..5a9a2c1b94 100644 --- a/be/test/olap/bloom_filter_test.cpp +++ b/be/test/olap/bloom_filter_test.cpp @@ -81,8 +81,8 @@ TEST_F(TestBloomFilter, add_and_test_bytes) { BloomFilter bf; bf.init(1024); - bf.add_bytes(NULL, 0); - ASSERT_TRUE(bf.test_bytes(NULL, 0)); + bf.add_bytes(nullptr, 0); + ASSERT_TRUE(bf.test_bytes(nullptr, 0)); bytes = "hello"; bf.add_bytes(bytes.c_str(), bytes.size()); diff --git a/be/test/olap/column_reader_test.cpp b/be/test/olap/column_reader_test.cpp old mode 100755 new mode 100644 index 8572ac9c1f..2c66b24856 --- a/be/test/olap/column_reader_test.cpp +++ b/be/test/olap/column_reader_test.cpp @@ -23,6 +23,9 @@ #include "olap/olap_define.h" #include "olap/olap_common.h" #include "olap/row_cursor.h" +#include "runtime/mem_pool.h" +#include "runtime/string_value.hpp" +#include "runtime/vectorized_row_batch.h" #include "util/logging.h" using std::string; @@ -48,6 +51,9 @@ public: _dictionary_buffers.clear(); _length_buffers.clear(); + + _mem_tracker.reset(new MemTracker(-1)); + _mem_pool.reset(new MemPool(_mem_tracker.get())); } virtual ~TestColumn() { @@ -114,8 +120,6 @@ public: included[0] = 0; UniqueIdToColumnIdMap segment_included; segment_included[0] = 0; - //UniqueIdSet segment_columns; - //segment_columns.insert(0); _column_reader = ColumnReader::create(0, tablet_schema, @@ -136,8 +140,9 @@ public: std::vector buffer_size; std::vector name; - for 
(std::map::const_iterator it = _stream_factory->streams().begin(); - it != _stream_factory->streams().end(); ++it) { + std::map::const_iterator it + = _stream_factory->streams().begin(); + for (; it != _stream_factory->streams().end(); ++it) { StreamName stream_name = it->first; OutStream *out_stream = it->second; std::vector *buffers; @@ -185,13 +190,18 @@ public: off[i], length[i], lzo_decompress, - buffer_size[i]); + buffer_size[i], + &_stats); ASSERT_EQ(OLAP_SUCCESS, in_stream->init()); _map_in_streams[name[i]] = in_stream; } - ASSERT_EQ(_column_reader->init(&_map_in_streams), OLAP_SUCCESS); + ASSERT_EQ(_column_reader->init( + &_map_in_streams, + 1024, + _mem_pool.get(), + &_stats), OLAP_SUCCESS); } void SetFieldInfo(FieldInfo &field_info, @@ -220,6 +230,9 @@ public: ColumnWriter *_column_writer; ColumnReader *_column_reader; + std::unique_ptr _mem_tracker; + std::unique_ptr _mem_pool; + std::unique_ptr _col_vector; OutStreamFactory *_stream_factory; @@ -240,14 +253,16 @@ public: std::map _map_in_streams; FileHandler helper; + + OlapReaderStatistics _stats; }; -TEST_F(TestColumn, TinyColumnWithoutPresent) { +TEST_F(TestColumn, VectorizedTinyColumnWithoutPresent) { // write data std::vector tablet_schema; FieldInfo field_info; SetFieldInfo(field_info, - std::string("TinyColumn"), + std::string("TinyColumn"), OLAP_FIELD_TYPE_TINYINT, OLAP_FIELD_AGGREGATION_REPLACE, 1, @@ -259,12 +274,13 @@ TEST_F(TestColumn, TinyColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); + char value = 1; - write_row.read(&value, sizeof(value)); + write_row.set_field_content(0, &value, _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); value = 3; - write_row.read(&value, sizeof(value)); + write_row.set_field_content(0, &value, _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; @@ -276,17 +292,17 @@ TEST_F(TestColumn, TinyColumnWithoutPresent) { RowCursor read_row; 
read_row.init(tablet_schema); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(&value); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + + value = *reinterpret_cast(data); ASSERT_EQ(value, 1); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(&value); - ASSERT_EQ(value, 3); - - ASSERT_NE(_column_reader->next(), OLAP_SUCCESS); + data++; + value = *reinterpret_cast(data); + ASSERT_EQ(value, 3); } TEST_F(TestColumn, SeekTinyColumnWithoutPresent) { @@ -306,18 +322,19 @@ TEST_F(TestColumn, SeekTinyColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); + char value = 1; - write_row.read(&value, sizeof(value)); + write_row.set_field_content(0, &value, _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); value = 2; - write_row.read(&value, sizeof(value)); + write_row.set_field_content(0, &value, _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); create_and_save_last_position(); value = 3; - write_row.read(&value, sizeof(value)); + write_row.set_field_content(0, &value, _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); create_and_save_last_position(); @@ -334,32 +351,36 @@ TEST_F(TestColumn, SeekTinyColumnWithoutPresent) { PositionEntryReader entry1; entry1._positions = _column_writer->index()->mutable_entry(0)->_positions; entry1._positions_count = _column_writer->index()->mutable_entry(0)->_positions_count; - entry1._statistics.init(OLAP_FIELD_TYPE_NONE, false); + entry1._statistics.init(OLAP_FIELD_TYPE_TINYINT, false); PositionEntryReader entry2; entry2._positions = _column_writer->index()->mutable_entry(1)->_positions; entry2._positions_count = 
_column_writer->index()->mutable_entry(1)->_positions_count; - entry2._statistics.init(OLAP_FIELD_TYPE_NONE, false); + entry2._statistics.init(OLAP_FIELD_TYPE_TINYINT, false); PositionProvider position0(&entry1); PositionProvider position1(&entry2); ASSERT_EQ(_column_reader->seek(&position0), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(&value); - ASSERT_EQ(value, 1); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + value = *reinterpret_cast(data); + ASSERT_EQ(value, 1); + data++; + value = *reinterpret_cast(data); + ASSERT_EQ(value, 2); ASSERT_EQ(_column_reader->seek(&position1), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(&value); - ASSERT_EQ(value, 3); - - ASSERT_NE(_column_reader->next(), OLAP_SUCCESS); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); + data = reinterpret_cast(_col_vector->col_data()); + value = *reinterpret_cast(data); + ASSERT_EQ(value, 3); } - TEST_F(TestColumn, SkipTinyColumnWithoutPresent) { // write data std::vector tablet_schema; @@ -378,15 +399,15 @@ TEST_F(TestColumn, SkipTinyColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); char value = 1; - write_row.read(&value, sizeof(value)); + write_row.set_field_content(0, &value, _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); value = 2; - write_row.read(&value, sizeof(value)); + write_row.set_field_content(0, &value, _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); value = 3; - write_row.read(&value, sizeof(value)); + write_row.set_field_content(0, &value, _mem_pool.get()); 
ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; @@ -399,16 +420,15 @@ TEST_F(TestColumn, SkipTinyColumnWithoutPresent) { read_row.init(tablet_schema); ASSERT_EQ(_column_reader->skip(2), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(&value); - ASSERT_EQ(value, 3); - - ASSERT_NE(_column_reader->next(), OLAP_SUCCESS); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + value = *reinterpret_cast(data); + ASSERT_EQ(value, 3); } - -TEST_F(TestColumn, TinyColumnWithPresent) { +TEST_F(TestColumn, VectorizedTinyColumnWithPresent) { // write data std::vector tablet_schema; FieldInfo field_info; @@ -425,12 +445,12 @@ TEST_F(TestColumn, TinyColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); - char value = 1; - write_row.read(&value, sizeof(value)); + write_row.set_null(0); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - value = 3; - write_row.read(&value, sizeof(value)); + write_row.set_not_null(0); + char value = 3; + write_row.set_field_content(0, &value, _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; @@ -442,15 +462,15 @@ TEST_F(TestColumn, TinyColumnWithPresent) { RowCursor read_row; read_row.init(tablet_schema); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(&value); - ASSERT_EQ(value, 1); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); + bool* is_null = _col_vector->is_null(); + ASSERT_EQ(is_null[0], true); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); 
- read_row.write(&value); - ASSERT_EQ(value, 3); + char* data = reinterpret_cast(_col_vector->col_data()); + ASSERT_EQ(is_null[1], false); + value = *reinterpret_cast(data + 1); } TEST_F(TestColumn, TinyColumnIndex) { @@ -471,11 +491,11 @@ TEST_F(TestColumn, TinyColumnIndex) { RowCursor write_row; write_row.init(tablet_schema); char value = 1; - write_row.read(&value, sizeof(value)); + write_row.set_field_content(0, &value, _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); value = 3; - write_row.read(&value, sizeof(value)); + write_row.set_field_content(0, &value, _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; @@ -487,14 +507,14 @@ TEST_F(TestColumn, TinyColumnIndex) { RowCursor read_row; read_row.init(tablet_schema); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(&value); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + value = *reinterpret_cast(data); ASSERT_EQ(value, 1); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(&value); + value = *reinterpret_cast(data + 1); ASSERT_EQ(value, 3); } @@ -516,17 +536,17 @@ TEST_F(TestColumn, SeekTinyColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); char value = 1; - write_row.read(&value, sizeof(value)); + write_row.set_field_content(0, &value, _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); value = 2; - write_row.read(&value, sizeof(value)); + write_row.set_field_content(0, &value, _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); create_and_save_last_position(); value = 3; - write_row.read(&value, sizeof(value)); + write_row.set_field_content(0, 
&value, _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); create_and_save_last_position(); @@ -543,29 +563,33 @@ TEST_F(TestColumn, SeekTinyColumnWithPresent) { PositionEntryReader entry1; entry1._positions = _column_writer->index()->mutable_entry(0)->_positions; entry1._positions_count = _column_writer->index()->mutable_entry(0)->_positions_count; - entry1._statistics.init(OLAP_FIELD_TYPE_NONE, false); + entry1._statistics.init(OLAP_FIELD_TYPE_TINYINT, false); PositionEntryReader entry2; entry2._positions = _column_writer->index()->mutable_entry(1)->_positions; entry2._positions_count = _column_writer->index()->mutable_entry(1)->_positions_count; - entry2._statistics.init(OLAP_FIELD_TYPE_NONE, false); + entry2._statistics.init(OLAP_FIELD_TYPE_TINYINT, false); PositionProvider position1(&entry1); PositionProvider position2(&entry2); ASSERT_EQ(_column_reader->seek(&position1), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(&value); - ASSERT_EQ(value, 1); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + value = *reinterpret_cast(data); + ASSERT_EQ(value, 1); + value = *reinterpret_cast(data + 1); + ASSERT_EQ(value, 2); ASSERT_EQ(_column_reader->seek(&position2), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(&value); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); + data = reinterpret_cast(_col_vector->col_data()); + value = *reinterpret_cast(data); ASSERT_EQ(value, 3); - - ASSERT_NE(_column_reader->next(), OLAP_SUCCESS); } TEST_F(TestColumn, SkipTinyColumnWithPresent) { @@ -586,15 +610,15 @@ TEST_F(TestColumn, 
SkipTinyColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); char value = 1; - write_row.read(&value, sizeof(value)); + write_row.set_field_content(0, &value, _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); value = 2; - write_row.read(&value, sizeof(value)); + write_row.set_field_content(0, &value, _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); value = 3; - write_row.read(&value, sizeof(value)); + write_row.set_field_content(0, &value, _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; @@ -607,15 +631,15 @@ TEST_F(TestColumn, SkipTinyColumnWithPresent) { read_row.init(tablet_schema); ASSERT_EQ(_column_reader->skip(2), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(&value); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + value = *reinterpret_cast(data); ASSERT_EQ(value, 3); - - ASSERT_NE(_column_reader->next(), OLAP_SUCCESS); } -TEST_F(TestColumn, ShortColumnWithoutPresent) { +TEST_F(TestColumn, VectorizedShortColumnWithoutPresent) { // write data std::vector tablet_schema; FieldInfo field_info; @@ -633,11 +657,11 @@ TEST_F(TestColumn, ShortColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); int16_t value = 1; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); value = 3; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; @@ -649,15 +673,15 @@ 
TEST_F(TestColumn, ShortColumnWithoutPresent) { RowCursor read_row; read_row.init(tablet_schema); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + value = *reinterpret_cast(data); ASSERT_EQ(value, 1); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_EQ(value, 3); + value = *reinterpret_cast(data + sizeof(int16_t)); + ASSERT_EQ(value, 3); } TEST_F(TestColumn, SeekShortColumnWithoutPresent) { @@ -678,17 +702,17 @@ TEST_F(TestColumn, SeekShortColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); int16_t value = 1; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); value = 2; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); create_and_save_last_position(); value = 3; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); create_and_save_last_position(); @@ -705,26 +729,33 @@ TEST_F(TestColumn, SeekShortColumnWithoutPresent) { PositionEntryReader entry1; entry1._positions = _column_writer->index()->mutable_entry(0)->_positions; entry1._positions_count = _column_writer->index()->mutable_entry(0)->_positions_count; - entry1._statistics.init(OLAP_FIELD_TYPE_NONE, false); + 
entry1._statistics.init(OLAP_FIELD_TYPE_SMALLINT, false); PositionEntryReader entry2; entry2._positions = _column_writer->index()->mutable_entry(1)->_positions; entry2._positions_count = _column_writer->index()->mutable_entry(1)->_positions_count; - entry2._statistics.init(OLAP_FIELD_TYPE_NONE, false); + entry2._statistics.init(OLAP_FIELD_TYPE_SMALLINT, false); PositionProvider position0(&entry1); PositionProvider position1(&entry2); ASSERT_EQ(_column_reader->seek(&position0), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_EQ(value, 1); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + value = *reinterpret_cast(data); + ASSERT_EQ(value, 1); + + value = *reinterpret_cast(data + sizeof(int16_t)); + ASSERT_EQ(value, 2); ASSERT_EQ(_column_reader->seek(&position1), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); + data = reinterpret_cast(_col_vector->col_data()); + value = *reinterpret_cast(data); ASSERT_EQ(value, 3); } @@ -746,15 +777,15 @@ TEST_F(TestColumn, SkipShortColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); int16_t value = 1; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); value = 2; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); 
ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); value = 3; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; @@ -767,57 +798,14 @@ TEST_F(TestColumn, SkipShortColumnWithoutPresent) { read_row.init(tablet_schema); ASSERT_EQ(_column_reader->skip(2), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + value = *reinterpret_cast(data); ASSERT_EQ(value, 3); } -TEST_F(TestColumn, ShortColumnWithPresent) { - // write data - std::vector tablet_schema; - FieldInfo field_info; - SetFieldInfo(field_info, - std::string("ShortColumn"), - OLAP_FIELD_TYPE_SMALLINT, - OLAP_FIELD_AGGREGATION_REPLACE, - 2, - true, - true); - tablet_schema.push_back(field_info); - - CreateColumnWriter(tablet_schema); - - RowCursor write_row; - write_row.init(tablet_schema); - int16_t value = 1; - write_row.read(reinterpret_cast(&value), sizeof(value)); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - value = 3; - write_row.read(reinterpret_cast(&value), sizeof(value)); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - ColumnDataHeaderMessage header; - ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); - - // read data - CreateColumnReader(tablet_schema); - - RowCursor read_row; - read_row.init(tablet_schema); - - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_EQ(value, 1); - - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - 
ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_EQ(value, 3); -} - TEST_F(TestColumn, SeekShortColumnWithPresent) { // write data std::vector tablet_schema; @@ -836,17 +824,17 @@ TEST_F(TestColumn, SeekShortColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); int16_t value = 1; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); value = 2; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); create_and_save_last_position(); value = 3; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); create_and_save_last_position(); @@ -863,29 +851,81 @@ TEST_F(TestColumn, SeekShortColumnWithPresent) { PositionEntryReader entry1; entry1._positions = _column_writer->index()->mutable_entry(0)->_positions; entry1._positions_count = _column_writer->index()->mutable_entry(0)->_positions_count; - entry1._statistics.init(OLAP_FIELD_TYPE_NONE, false); + entry1._statistics.init(OLAP_FIELD_TYPE_SMALLINT, false); PositionEntryReader entry2; entry2._positions = _column_writer->index()->mutable_entry(1)->_positions; entry2._positions_count = _column_writer->index()->mutable_entry(1)->_positions_count; - entry2._statistics.init(OLAP_FIELD_TYPE_NONE, false); + entry2._statistics.init(OLAP_FIELD_TYPE_SMALLINT, false); PositionProvider position0(&entry1); PositionProvider position1(&entry2); ASSERT_EQ(_column_reader->seek(&position0), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - 
read_row.write(reinterpret_cast(&value)); - ASSERT_EQ(value, 1); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + value = *reinterpret_cast(data); + ASSERT_EQ(value, 1); ASSERT_EQ(_column_reader->seek(&position1), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); + data = reinterpret_cast(_col_vector->col_data()); + value = *reinterpret_cast(data); ASSERT_EQ(value, 3); } +TEST_F(TestColumn, VectorizedShortColumnWithPresent) { + // write data + std::vector tablet_schema; + FieldInfo field_info; + SetFieldInfo(field_info, + std::string("ShortColumn"), + OLAP_FIELD_TYPE_SMALLINT, + OLAP_FIELD_AGGREGATION_REPLACE, + 2, + true, + true); + tablet_schema.push_back(field_info); + + CreateColumnWriter(tablet_schema); + + RowCursor write_row; + write_row.init(tablet_schema); + + write_row.set_null(0); + ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + + int16_t value = 3; + write_row.set_not_null(0); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); + ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + + ColumnDataHeaderMessage header; + ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); + + // read data + CreateColumnReader(tablet_schema); + + RowCursor read_row; + read_row.init(tablet_schema); + + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); + bool* is_null = _col_vector->is_null(); + ASSERT_EQ(is_null[0], true); + + char* data = reinterpret_cast(_col_vector->col_data()); + ASSERT_EQ(is_null[1], false); + + value = 
*reinterpret_cast(data + sizeof(int16_t)); + ASSERT_EQ(value, 3); +} + TEST_F(TestColumn, SkipShortColumnWithPresent) { // write data std::vector tablet_schema; @@ -904,15 +944,15 @@ TEST_F(TestColumn, SkipShortColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); int16_t value = 1; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); value = 2; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); value = 3; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; @@ -925,109 +965,20 @@ TEST_F(TestColumn, SkipShortColumnWithPresent) { read_row.init(tablet_schema); ASSERT_EQ(_column_reader->skip(2), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + value = *reinterpret_cast(data); ASSERT_EQ(value, 3); } - -TEST_F(TestColumn, UnsignedShortColumnWithoutPresent) { +TEST_F(TestColumn, VectorizedIntColumnWithoutPresent) { // write data std::vector tablet_schema; FieldInfo field_info; SetFieldInfo(field_info, - std::string("UnsignedShortColumn"), - OLAP_FIELD_TYPE_UNSIGNED_SMALLINT, - OLAP_FIELD_AGGREGATION_REPLACE, - 2, - false, - true); - tablet_schema.push_back(field_info); - - CreateColumnWriter(tablet_schema); - - RowCursor write_row; - write_row.init(tablet_schema); - uint16_t value = 
1; - write_row.read(reinterpret_cast(&value), sizeof(value)); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - value = 3; - write_row.read(reinterpret_cast(&value), sizeof(value)); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - ColumnDataHeaderMessage header; - ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); - - // read data - CreateColumnReader(tablet_schema); - - RowCursor read_row; - read_row.init(tablet_schema); - - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_EQ(value, 1); - - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_EQ(value, 3); -} - -TEST_F(TestColumn, UnsignedShortColumnWithPresent) { - // write data - std::vector tablet_schema; - FieldInfo field_info; - SetFieldInfo(field_info, - std::string("UnsignedShortColumn"), - OLAP_FIELD_TYPE_UNSIGNED_SMALLINT, - OLAP_FIELD_AGGREGATION_REPLACE, - 2, - true, - true); - tablet_schema.push_back(field_info); - - CreateColumnWriter(tablet_schema); - - RowCursor write_row; - write_row.init(tablet_schema); - uint16_t value = 1; - write_row.read(reinterpret_cast(&value), sizeof(value)); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - value = 3; - write_row.read(reinterpret_cast(&value), sizeof(value)); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - ColumnDataHeaderMessage header; - ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); - - // read data - CreateColumnReader(tablet_schema); - - RowCursor read_row; - read_row.init(tablet_schema); - - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_EQ(value, 1); - - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - 
ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_EQ(value, 3); -} - -TEST_F(TestColumn, IntColumnWithoutPresent) { - // write data - std::vector tablet_schema; - FieldInfo field_info; - SetFieldInfo(field_info, - std::string("UnsignedShortColumn"), + std::string("IntColumn"), OLAP_FIELD_TYPE_INT, OLAP_FIELD_AGGREGATION_REPLACE, 4, @@ -1040,11 +991,11 @@ TEST_F(TestColumn, IntColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); int32_t value = 1; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); value = 3; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; @@ -1056,24 +1007,23 @@ TEST_F(TestColumn, IntColumnWithoutPresent) { RowCursor read_row; read_row.init(tablet_schema); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + value = *reinterpret_cast(data); ASSERT_EQ(value, 1); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); + value = *reinterpret_cast(data + sizeof(int)); ASSERT_EQ(value, 3); } - -TEST_F(TestColumn, IntColumnMassWithoutPresent) { +TEST_F(TestColumn, VectorizedIntColumnMassWithoutPresent) { // write data std::vector tablet_schema; FieldInfo field_info; SetFieldInfo(field_info, - std::string("UnsignedShortColumn"), + std::string("IntColumn"), 
OLAP_FIELD_TYPE_INT, OLAP_FIELD_AGGREGATION_REPLACE, 4, @@ -1087,7 +1037,7 @@ TEST_F(TestColumn, IntColumnMassWithoutPresent) { write_row.init(tablet_schema); for (int32_t i = 0; i < 10000; i++) { - write_row.read(reinterpret_cast(&i), sizeof(i)); + write_row.set_field_content(0, reinterpret_cast(&i), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); } @@ -1100,22 +1050,29 @@ TEST_F(TestColumn, IntColumnMassWithoutPresent) { RowCursor read_row; read_row.init(tablet_schema); - for (int32_t i = 0; i < 10000; i++) { - int32_t value; - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); + _col_vector.reset(new ColumnVector()); + + char* data = NULL; + for (int32_t i = 0; i < 10000; ++i) { + if (i % 1000 == 0) { + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 1000, _mem_pool.get()), OLAP_SUCCESS); + data = reinterpret_cast(_col_vector->col_data()); + } + + int32_t value = 0; + value = *reinterpret_cast(data); ASSERT_EQ(value, i); + data += sizeof(int32_t); } } - -TEST_F(TestColumn, IntColumnWithPresent) { +TEST_F(TestColumn, VectorizedIntColumnWithPresent) { // write data std::vector tablet_schema; FieldInfo field_info; SetFieldInfo(field_info, - std::string("UnsignedShortColumn"), + std::string("IntColumn"), OLAP_FIELD_TYPE_INT, OLAP_FIELD_AGGREGATION_REPLACE, 4, @@ -1128,11 +1085,10 @@ TEST_F(TestColumn, IntColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); int32_t value = -1; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - value = 3; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_null(0); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; @@ -1144,108 +1100,21 @@ TEST_F(TestColumn, 
IntColumnWithPresent) { RowCursor read_row; read_row.init(tablet_schema); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); + _col_vector.reset(new ColumnVector()); + + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); + + bool* is_null = _col_vector->is_null(); + char* data = reinterpret_cast(_col_vector->col_data()); + ASSERT_EQ(is_null[0], false); + value = *reinterpret_cast(data); ASSERT_EQ(value, -1); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_EQ(value, 3); + ASSERT_EQ(is_null[1], true); } -TEST_F(TestColumn, UnsignedIntColumnWithoutPresent) { - // write data - std::vector tablet_schema; - FieldInfo field_info; - SetFieldInfo(field_info, - std::string("UnsignedShortColumn"), - OLAP_FIELD_TYPE_UNSIGNED_INT, - OLAP_FIELD_AGGREGATION_REPLACE, - 4, - false, - true); - tablet_schema.push_back(field_info); - - CreateColumnWriter(tablet_schema); - - RowCursor write_row; - write_row.init(tablet_schema); - uint32_t value = 1; - write_row.read(reinterpret_cast(&value), sizeof(value)); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - value = 3; - write_row.read(reinterpret_cast(&value), sizeof(value)); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - ColumnDataHeaderMessage header; - ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); - - // read data - CreateColumnReader(tablet_schema); - - RowCursor read_row; - read_row.init(tablet_schema); - - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_EQ(value, 1); - - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - 
read_row.write(reinterpret_cast(&value)); - ASSERT_EQ(value, 3); -} - -TEST_F(TestColumn, UnsignedIntColumnWithPresent) { - // write data - std::vector tablet_schema; - FieldInfo field_info; - SetFieldInfo(field_info, - std::string("UnsignedShortColumn"), - OLAP_FIELD_TYPE_UNSIGNED_INT, - OLAP_FIELD_AGGREGATION_REPLACE, - 4, - true, - true); - tablet_schema.push_back(field_info); - - CreateColumnWriter(tablet_schema); - - RowCursor write_row; - write_row.init(tablet_schema); - uint32_t value = 1; - write_row.read(reinterpret_cast(&value), sizeof(value)); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - value = 3; - write_row.read(reinterpret_cast(&value), sizeof(value)); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - ColumnDataHeaderMessage header; - ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); - - // read data - CreateColumnReader(tablet_schema); - - RowCursor read_row; - read_row.init(tablet_schema); - - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_EQ(value, 1); - - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_EQ(value, 3); -} - -TEST_F(TestColumn, LongColumnWithoutPresent) { +TEST_F(TestColumn, VectorizedLongColumnWithoutPresent) { // write data std::vector tablet_schema; FieldInfo field_info; @@ -1263,11 +1132,11 @@ TEST_F(TestColumn, LongColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); int64_t value = 1; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); value = 3; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); 
ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; @@ -1279,18 +1148,18 @@ TEST_F(TestColumn, LongColumnWithoutPresent) { RowCursor read_row; read_row.init(tablet_schema); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + value = *reinterpret_cast(data); ASSERT_EQ(value, 1); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_EQ(value, 3); + value = *reinterpret_cast(data + sizeof(int64_t)); + ASSERT_EQ(value, 3); } -TEST_F(TestColumn, LongColumnWithPresent) { +TEST_F(TestColumn, VectorizedLongColumnWithPresent) { // write data std::vector tablet_schema; FieldInfo field_info; @@ -1307,12 +1176,12 @@ TEST_F(TestColumn, LongColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); - int64_t value = 1; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_null(0); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - value = 3; - write_row.read(reinterpret_cast(&value), sizeof(value)); + int64_t value = 3; + write_row.set_not_null(0); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; @@ -1324,113 +1193,25 @@ TEST_F(TestColumn, LongColumnWithPresent) { RowCursor read_row; read_row.init(tablet_schema); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_EQ(value, 1); + _col_vector.reset(new ColumnVector()); + 
ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); + bool* is_null = _col_vector->is_null(); + ASSERT_EQ(is_null[0], true); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_EQ(value, 3); + char* data = reinterpret_cast(_col_vector->col_data()); + ASSERT_EQ(is_null[1], false); + + value = *reinterpret_cast(data + sizeof(int64_t)); + ASSERT_EQ(value, 3); } -TEST_F(TestColumn, UnsignedLongColumnWithoutPresent) { +TEST_F(TestColumn, VectorizedFloatColumnWithoutPresent) { // write data std::vector tablet_schema; FieldInfo field_info; SetFieldInfo(field_info, - std::string("UnsignedLongColumnWithoutPresent"), - OLAP_FIELD_TYPE_UNSIGNED_BIGINT, - OLAP_FIELD_AGGREGATION_REPLACE, - 8, - false, - true); - tablet_schema.push_back(field_info); - - CreateColumnWriter(tablet_schema); - - RowCursor write_row; - write_row.init(tablet_schema); - uint64_t value = 1; - write_row.read(reinterpret_cast(&value), sizeof(value)); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - value = 3; - write_row.read(reinterpret_cast(&value), sizeof(value)); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - ColumnDataHeaderMessage header; - ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); - - // read data - CreateColumnReader(tablet_schema); - - RowCursor read_row; - read_row.init(tablet_schema); - - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_EQ(value, 1); - - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_EQ(value, 3); -} - -TEST_F(TestColumn, UnsignedLongColumnWithPresent) { - // write data - std::vector tablet_schema; - FieldInfo field_info; - 
SetFieldInfo(field_info, - std::string("UnsignedLongColumnWithoutPresent"), - OLAP_FIELD_TYPE_UNSIGNED_BIGINT, - OLAP_FIELD_AGGREGATION_REPLACE, - 8, - true, - true); - tablet_schema.push_back(field_info); - - CreateColumnWriter(tablet_schema); - - RowCursor write_row; - write_row.init(tablet_schema); - uint64_t value = 1; - write_row.read(reinterpret_cast(&value), sizeof(value)); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - value = 3; - write_row.read(reinterpret_cast(&value), sizeof(value)); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - ColumnDataHeaderMessage header; - ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); - - // read data - CreateColumnReader(tablet_schema); - - RowCursor read_row; - read_row.init(tablet_schema); - - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_EQ(value, 1); - - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_EQ(value, 3); -} - -TEST_F(TestColumn, FloatColumnWithoutPresent) { - // write data - std::vector tablet_schema; - FieldInfo field_info; - SetFieldInfo(field_info, - std::string("UnsignedLongColumnWithoutPresent"), + std::string("FloatColumnWithoutPresent"), OLAP_FIELD_TYPE_FLOAT, OLAP_FIELD_AGGREGATION_REPLACE, 4, @@ -1443,11 +1224,11 @@ TEST_F(TestColumn, FloatColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); float value = 1.234; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); value = 3.234; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), 
OLAP_SUCCESS); ColumnDataHeaderMessage header; @@ -1459,19 +1240,19 @@ TEST_F(TestColumn, FloatColumnWithoutPresent) { RowCursor read_row; read_row.init(tablet_schema); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + value = *reinterpret_cast(data); ASSERT_FLOAT_EQ(value, 1.234); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_FLOAT_EQ(value, 3.234); - + data += sizeof(float); + value = *reinterpret_cast(data); + ASSERT_FLOAT_EQ(value, 3.234); } -TEST_F(TestColumn, FloatColumnWithPresent) { +TEST_F(TestColumn, VectorizedFloatColumnWithPresent) { // write data std::vector tablet_schema; FieldInfo field_info; @@ -1488,12 +1269,12 @@ TEST_F(TestColumn, FloatColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); - float value = 1.234; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_null(0); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - value = 3.234; - write_row.read(reinterpret_cast(&value), sizeof(value)); + float value = 3.234; + write_row.set_not_null(0); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; @@ -1505,16 +1286,17 @@ TEST_F(TestColumn, FloatColumnWithPresent) { RowCursor read_row; read_row.init(tablet_schema); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_FLOAT_EQ(value, 1.234); + _col_vector.reset(new ColumnVector()); + 
ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); + bool* is_null = _col_vector->is_null(); + ASSERT_EQ(is_null[0], true); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_FLOAT_EQ(value, 3.234); - + ASSERT_EQ(is_null[1], false); + + char* data = reinterpret_cast(_col_vector->col_data()) + sizeof(float); + value = *reinterpret_cast(data); + ASSERT_FLOAT_EQ(value, 3.234); } TEST_F(TestColumn, SeekFloatColumnWithPresent) { @@ -1535,13 +1317,13 @@ TEST_F(TestColumn, SeekFloatColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); float value = 1.234; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); create_and_save_last_position(); value = 3.234; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); create_and_save_last_position(); @@ -1558,26 +1340,25 @@ TEST_F(TestColumn, SeekFloatColumnWithPresent) { PositionEntryReader entry1; entry1._positions = _column_writer->index()->mutable_entry(0)->_positions; entry1._positions_count = _column_writer->index()->mutable_entry(0)->_positions_count; - entry1._statistics.init(OLAP_FIELD_TYPE_NONE, false); + entry1._statistics.init(OLAP_FIELD_TYPE_FLOAT, false); PositionEntryReader entry2; entry2._positions = _column_writer->index()->mutable_entry(1)->_positions; entry2._positions_count = _column_writer->index()->mutable_entry(1)->_positions_count; - entry2._statistics.init(OLAP_FIELD_TYPE_NONE, false); + entry2._statistics.init(OLAP_FIELD_TYPE_FLOAT, false); PositionProvider position0(&entry1); PositionProvider position1(&entry2); 
ASSERT_EQ(_column_reader->seek(&position0), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + value = *reinterpret_cast(data); ASSERT_FLOAT_EQ(value, 1.234); - ASSERT_EQ(_column_reader->seek(&position1), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); + value = *reinterpret_cast(data + sizeof(float)); ASSERT_FLOAT_EQ(value, 3.234); } @@ -1599,11 +1380,11 @@ TEST_F(TestColumn, SkipFloatColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); float value = 1.234; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); value = 3.234; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; @@ -1616,13 +1397,15 @@ TEST_F(TestColumn, SkipFloatColumnWithPresent) { read_row.init(tablet_schema); ASSERT_EQ(_column_reader->skip(1), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + value = *reinterpret_cast(data); ASSERT_FLOAT_EQ(value, 3.234); } -TEST_F(TestColumn, DoubleColumnWithoutPresent) { 
+TEST_F(TestColumn, VectorizedDoubleColumnWithoutPresent) { // write data std::vector tablet_schema; FieldInfo field_info; @@ -1640,11 +1423,11 @@ TEST_F(TestColumn, DoubleColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); double value = 1.23456789; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); value = 3.23456789; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; @@ -1656,19 +1439,19 @@ TEST_F(TestColumn, DoubleColumnWithoutPresent) { RowCursor read_row; read_row.init(tablet_schema); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + value = *reinterpret_cast(data); ASSERT_DOUBLE_EQ(value, 1.23456789); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_DOUBLE_EQ(value, 3.23456789); - + data += sizeof(double); + value = *reinterpret_cast(data); + ASSERT_DOUBLE_EQ(value, 3.23456789); } -TEST_F(TestColumn, DoubleColumnWithPresent) { +TEST_F(TestColumn, VectorizedDoubleColumnWithPresent) { // write data std::vector tablet_schema; FieldInfo field_info; @@ -1685,12 +1468,12 @@ TEST_F(TestColumn, DoubleColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); - double value = 1.23456789; - write_row.read(reinterpret_cast(&value), sizeof(value)); + write_row.set_null(0); 
ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - value = 3.23456789; - write_row.read(reinterpret_cast(&value), sizeof(value)); + double value = 3.23456789; + write_row.set_not_null(0); + write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; @@ -1702,227 +1485,21 @@ TEST_F(TestColumn, DoubleColumnWithPresent) { RowCursor read_row; read_row.init(tablet_schema); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_DOUBLE_EQ(value, 1.23456789); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); + bool* is_null = _col_vector->is_null(); + ASSERT_EQ(is_null[0], true); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - read_row.write(reinterpret_cast(&value)); - ASSERT_DOUBLE_EQ(value, 3.23456789); - + char* data = reinterpret_cast(_col_vector->col_data()); + ASSERT_EQ(is_null[1], false); + + data += sizeof(double); + value = *reinterpret_cast(data); + ASSERT_DOUBLE_EQ(value, 3.23456789); } -TEST_F(TestColumn, DiscreteDoubleColumnWithoutPresent) { - // write data - std::vector tablet_schema; - FieldInfo field_info; - SetFieldInfo(field_info, - std::string("DoubleColumnWithoutPresent"), - OLAP_FIELD_TYPE_DISCRETE_DOUBLE, - OLAP_FIELD_AGGREGATION_REPLACE, - 8, - false, - true); - tablet_schema.push_back(field_info); - - CreateColumnWriter(tablet_schema); - - RowCursor write_row; - write_row.init(tablet_schema); - - std::vector val_string_array; - val_string_array.push_back("1234.5678"); - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - ColumnDataHeaderMessage header; - ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); - - 
// read data - CreateColumnReader(tablet_schema); - - RowCursor read_row; - read_row.init(tablet_schema); - - char read_value[20]; - memset(read_value, 0, 20); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_TRUE(strncmp(read_row.to_string().c_str(), "0&1234.5678", strlen("0&1234.5678")) == 0); - -} - -TEST_F(TestColumn, DiscreteDoubleColumnWithPresent) { - // write data - std::vector tablet_schema; - FieldInfo field_info; - SetFieldInfo(field_info, - std::string("DiscreteDoubleColumnWithPresent"), - OLAP_FIELD_TYPE_DISCRETE_DOUBLE, - OLAP_FIELD_AGGREGATION_REPLACE, - 8, - true, - true); - tablet_schema.push_back(field_info); - - CreateColumnWriter(tablet_schema); - - RowCursor write_row; - write_row.init(tablet_schema); - - std::vector val_string_array; - val_string_array.push_back("1234.5678"); - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - val_string_array.clear(); - val_string_array.push_back("5678.1234"); - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - ColumnDataHeaderMessage header; - ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); - - // read data - CreateColumnReader(tablet_schema); - - RowCursor read_row; - read_row.init(tablet_schema); - - char read_value[20]; - memset(read_value, 0, 20); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_TRUE(strncmp(read_row.to_string().c_str(), "0&1234.5678", strlen("0&1234.5678")) == 0); - - memset(read_value, 0, 20); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_TRUE(strncmp(read_row.to_string().c_str(), "0&5678.1234", strlen("0&5678.1234")) == 0); - -} - -TEST_F(TestColumn, SeekDiscreteDoubleColumnWithPresent) { - // write data - std::vector tablet_schema; - 
FieldInfo field_info; - SetFieldInfo(field_info, - std::string("DiscreteDoubleColumnWithPresent"), - OLAP_FIELD_TYPE_DISCRETE_DOUBLE, - OLAP_FIELD_AGGREGATION_REPLACE, - 8, - true, - true); - tablet_schema.push_back(field_info); - - CreateColumnWriter(tablet_schema); - - RowCursor write_row; - write_row.init(tablet_schema); - - std::vector val_string_array; - val_string_array.push_back("1234.5678"); - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - create_and_save_last_position(); - - val_string_array.clear(); - val_string_array.push_back("5678.1234"); - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - create_and_save_last_position(); - - ColumnDataHeaderMessage header; - ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); - - // read data - CreateColumnReader(tablet_schema); - - RowCursor read_row; - read_row.init(tablet_schema); - - PositionEntryReader entry1; - entry1._positions = _column_writer->index()->mutable_entry(0)->_positions; - entry1._positions_count = _column_writer->index()->mutable_entry(0)->_positions_count; - entry1._statistics.init(OLAP_FIELD_TYPE_NONE, false); - - PositionEntryReader entry2; - entry2._positions = _column_writer->index()->mutable_entry(1)->_positions; - entry2._positions_count = _column_writer->index()->mutable_entry(1)->_positions_count; - entry2._statistics.init(OLAP_FIELD_TYPE_NONE, false); - - PositionProvider position0(&entry1); - PositionProvider position1(&entry2); - - ASSERT_EQ(_column_reader->seek(&position0), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_TRUE(strncmp(read_row.to_string().c_str(), "0&1234.5678", strlen("0&1234.5678")) == 0); - - ASSERT_EQ(_column_reader->seek(&position1), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), 
OLAP_SUCCESS); - ASSERT_TRUE(strncmp(read_row.to_string().c_str(), "0&5678.1234", strlen("0&5678.1234")) == 0); -} - -TEST_F(TestColumn, SkipDiscreteDoubleColumnWithPresent) { - // write data - std::vector tablet_schema; - FieldInfo field_info; - SetFieldInfo(field_info, - std::string("DiscreteDoubleColumnWithPresent"), - OLAP_FIELD_TYPE_DISCRETE_DOUBLE, - OLAP_FIELD_AGGREGATION_REPLACE, - 8, - true, - true); - tablet_schema.push_back(field_info); - - CreateColumnWriter(tablet_schema); - - RowCursor write_row; - write_row.init(tablet_schema); - - std::vector val_string_array; - val_string_array.push_back("1234.5678"); - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - val_string_array.clear(); - val_string_array.push_back("5678.1234"); - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - ColumnDataHeaderMessage header; - ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); - - // read data - CreateColumnReader(tablet_schema); - - RowCursor read_row; - read_row.init(tablet_schema); - - PositionEntryReader entry2; - entry2._positions = _column_writer->index_entry()->_positions; - entry2._positions_count = _column_writer->index_entry()->_positions_count; - entry2._statistics.init(OLAP_FIELD_TYPE_NONE, false); - - UniqueIdPositionProviderMap positions_map; - positions_map[0] = PositionProvider(&entry2); - char read_value[20]; - - memset(read_value, 0, 20); - ASSERT_EQ(_column_reader->skip(1), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_TRUE(strncmp(read_row.to_string().c_str(), "0&5678.1234", strlen("0&5678.1234")) == 0); -} - -TEST_F(TestColumn, DatetimeColumnWithoutPresent) { +TEST_F(TestColumn, VectorizedDatetimeColumnWithoutPresent) { // write data std::vector tablet_schema; FieldInfo field_info; @@ -1954,15 +1531,16 @@ TEST_F(TestColumn, 
DatetimeColumnWithoutPresent) { RowCursor read_row; read_row.init(tablet_schema); - char read_value[20]; - memset(read_value, 0, 20); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_TRUE(strncmp(read_row.to_string().c_str(), "0&2000-10-10 10:10:10", strlen("0&2000-10-10 10:10:10")) == 0); - + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + read_row.set_field_content(0, data, _mem_pool.get()); + ASSERT_TRUE(strncmp(read_row.to_string().c_str(), + "0&2000-10-10 10:10:10", strlen("0&2000-10-10 10:10:10")) == 0); } -TEST_F(TestColumn, DatetimeColumnWithPresent) { +TEST_F(TestColumn, VectorizedDatetimeColumnWithPresent) { // write data std::vector tablet_schema; FieldInfo field_info; @@ -1979,10 +1557,13 @@ TEST_F(TestColumn, DatetimeColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); - + write_row.set_null(0); + ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + std::vector val_string_array; val_string_array.push_back("2000-10-10 10:10:10"); write_row.from_string(val_string_array); + write_row.set_not_null(0); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; @@ -1994,17 +1575,25 @@ TEST_F(TestColumn, DatetimeColumnWithPresent) { RowCursor read_row; read_row.init(tablet_schema); - char read_value[20]; - memset(read_value, 0, 20); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_TRUE(strncmp(read_row.to_string().c_str(), "0&2000-10-10 10:10:10", strlen("0&2000-10-10 10:10:10")) == 0); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); + bool* is_null = _col_vector->is_null(); + ASSERT_EQ(is_null[0], true); - 
ASSERT_NE(_column_reader->next(), OLAP_SUCCESS); - + char* data = reinterpret_cast(_col_vector->col_data()); + ASSERT_EQ(is_null[1], false); + + data += sizeof(uint64_t); + read_row.set_field_content(0, data, _mem_pool.get()); + ASSERT_TRUE(strncmp(read_row.to_string().c_str(), + "0&2000-10-10 10:10:10", strlen("0&2000-10-10 10:10:10")) == 0); + + ASSERT_NE(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); } -TEST_F(TestColumn, DateColumnWithoutPresent) { +TEST_F(TestColumn, VectorizedDateColumnWithoutPresent) { // write data std::vector tablet_schema; FieldInfo field_info; @@ -2036,14 +1625,15 @@ TEST_F(TestColumn, DateColumnWithoutPresent) { RowCursor read_row; read_row.init(tablet_schema); - char read_value[20]; - memset(read_value, 0, 20); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + read_row.set_field_content(0, data, _mem_pool.get()); ASSERT_TRUE(strncmp(read_row.to_string().c_str(), "0&2000-10-10", strlen("0&2000-10-10")) == 0); } -TEST_F(TestColumn, DateColumnWithPresent) { +TEST_F(TestColumn, VectorizedDateColumnWithPresent) { // write data std::vector tablet_schema; FieldInfo field_info; @@ -2060,11 +1650,15 @@ TEST_F(TestColumn, DateColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); - + + write_row.set_null(0); + ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + std::vector val_string_array; val_string_array.push_back("2000-10-10"); write_row.from_string(val_string_array); for (uint32_t i = 0; i < 100; ++i) { + write_row.set_not_null(0); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); } @@ -2077,18 +1671,23 @@ TEST_F(TestColumn, DateColumnWithPresent) { RowCursor read_row; read_row.init(tablet_schema); - char 
read_value[20]; - for (uint32_t i = 0; i < 100; ++i) { - memset(read_value, 0, 20); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_TRUE(strncmp(read_row.to_string().c_str(), "0&2000-10-10", strlen("0&2000-10-10")) == 0); - } + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 101, _mem_pool.get()), OLAP_SUCCESS); + bool* is_null = _col_vector->is_null(); + ASSERT_EQ(is_null[0], true); - ASSERT_NE(_column_reader->next(), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + for (uint32_t i = 0; i < 100; ++i) { + data += sizeof(uint24_t); + ASSERT_EQ(is_null[i+1], false); + read_row.set_field_content(0, data, _mem_pool.get()); + ASSERT_TRUE(strncmp(read_row.to_string().c_str(), + "0&2000-10-10", strlen("0&2000-10-10")) == 0); + } } -TEST_F(TestColumn, DecimalColumnWithoutPresent) { +TEST_F(TestColumn, VectorizedDecimalColumnWithoutPresent) { // write data std::vector tablet_schema; FieldInfo field_info; @@ -2125,19 +1724,19 @@ TEST_F(TestColumn, DecimalColumnWithoutPresent) { RowCursor read_row; read_row.init(tablet_schema); - char read_value[20]; - memset(read_value, 0, 20); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + read_row.set_field_content(0, data, _mem_pool.get()); ASSERT_TRUE(strncmp(read_row.to_string().c_str(), "0&1234.5678", strlen("0&1234.5678")) == 0); - memset(read_value, 0, 20); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); + data += sizeof(decimal12_t); + read_row.set_field_content(0, data, _mem_pool.get()); ASSERT_TRUE(strncmp(read_row.to_string().c_str(), "0&5678.1234", 
strlen("0&5678.1234")) == 0); } -TEST_F(TestColumn, DecimalColumnWithPresent) { +TEST_F(TestColumn, VectorizedDecimalColumnWithPresent) { // write data std::vector tablet_schema; FieldInfo field_info; @@ -2156,13 +1755,13 @@ TEST_F(TestColumn, DecimalColumnWithPresent) { write_row.init(tablet_schema); std::vector val_string_array; - val_string_array.push_back("1234.5678"); - write_row.from_string(val_string_array); + write_row.set_null(0); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); val_string_array.clear(); val_string_array.push_back("5678.1234"); write_row.from_string(val_string_array); + write_row.set_not_null(0); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; @@ -2174,15 +1773,16 @@ TEST_F(TestColumn, DecimalColumnWithPresent) { RowCursor read_row; read_row.init(tablet_schema); - char read_value[20]; - memset(read_value, 0, 20); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_TRUE(strncmp(read_row.to_string().c_str(), "0&1234.5678", strlen("0&1234.5678")) == 0); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); + bool* is_null = _col_vector->is_null(); + ASSERT_EQ(is_null[0], true); - memset(read_value, 0, 20); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + data += sizeof(decimal12_t); + ASSERT_EQ(is_null[1], false); + read_row.set_field_content(0, data, _mem_pool.get()); ASSERT_TRUE(strncmp(read_row.to_string().c_str(), "0&5678.1234", strlen("0&5678.1234")) == 0); } @@ -2226,8 +1826,11 @@ TEST_F(TestColumn, SkipDecimalColumnWithPresent) { char read_value[20]; memset(read_value, 0, 20); ASSERT_EQ(_column_reader->skip(1), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - 
ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + read_row.set_field_content(0, data, _mem_pool.get()); ASSERT_TRUE(strncmp(read_row.to_string().c_str(), "0&5678.1234", strlen("0&5678.1234")) == 0); } @@ -2274,12 +1877,12 @@ TEST_F(TestColumn, SeekDecimalColumnWithPresent) { PositionEntryReader entry1; entry1._positions = _column_writer->index()->mutable_entry(0)->_positions; entry1._positions_count = _column_writer->index()->mutable_entry(0)->_positions_count; - entry1._statistics.init(OLAP_FIELD_TYPE_NONE, false); + entry1._statistics.init(OLAP_FIELD_TYPE_FLOAT, false); PositionEntryReader entry2; entry2._positions = _column_writer->index()->mutable_entry(1)->_positions; entry2._positions_count = _column_writer->index()->mutable_entry(1)->_positions_count; - entry2._statistics.init(OLAP_FIELD_TYPE_NONE, false); + entry2._statistics.init(OLAP_FIELD_TYPE_FLOAT, false); PositionProvider position0(&entry1); PositionProvider position1(&entry2); @@ -2287,18 +1890,24 @@ TEST_F(TestColumn, SeekDecimalColumnWithPresent) { char read_value[20]; memset(read_value, 0, 20); ASSERT_EQ(_column_reader->seek(&position0), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + read_row.set_field_content(0, data, _mem_pool.get()); ASSERT_TRUE(strncmp(read_row.to_string().c_str(), "0&1234.5678", strlen("0&1234.5678")) == 0); memset(read_value, 0, 20); ASSERT_EQ(_column_reader->seek(&position1), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), 
OLAP_SUCCESS); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); + data = reinterpret_cast(_col_vector->col_data()); + read_row.set_field_content(0, data, _mem_pool.get()); ASSERT_TRUE(strncmp(read_row.to_string().c_str(), "0&5678.1234", strlen("0&5678.1234")) == 0); } -TEST_F(TestColumn, LargeIntColumnWithoutPresent) { +TEST_F(TestColumn, VectorizedLargeIntColumnWithoutPresent) { // init table schema std::vector tablet_schema; FieldInfo field_info; @@ -2338,18 +1947,20 @@ TEST_F(TestColumn, LargeIntColumnWithoutPresent) { RowCursor read_row; read_row.init(tablet_schema); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + read_row.set_field_content(0, data, _mem_pool.get()); value1 = "0&" + value1; value2 = "0&" + value2; ASSERT_TRUE(strncmp(read_row.to_string().c_str(), value1.c_str(), value1.size()) == 0); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); + read_row.set_field_content(0, data + sizeof(int128_t), _mem_pool.get()); ASSERT_TRUE(strncmp(read_row.to_string().c_str(), value2.c_str(), value2.size()) == 0); } -TEST_F(TestColumn, LargeIntColumnWithPresent) { +TEST_F(TestColumn, VectorizedLargeIntColumnWithPresent) { // init table schema std::vector tablet_schema; FieldInfo field_info; @@ -2371,14 +1982,19 @@ TEST_F(TestColumn, LargeIntColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); + write_row.set_null(0); + ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + std::vector val_string_array; val_string_array.push_back(value1); write_row.from_string(val_string_array); + write_row.set_not_null(0); 
ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); val_string_array.clear(); val_string_array.push_back(value2); write_row.from_string(val_string_array); + write_row.set_not_null(0); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; @@ -2389,14 +2005,24 @@ TEST_F(TestColumn, LargeIntColumnWithPresent) { RowCursor read_row; read_row.init(tablet_schema); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 3, _mem_pool.get()), OLAP_SUCCESS); + bool* is_null = _col_vector->is_null(); + ASSERT_EQ(is_null[0], true); + ASSERT_EQ(is_null[1], false); + ASSERT_EQ(is_null[2], false); + + char* data = reinterpret_cast(_col_vector->col_data()); value1 = "0&" + value1; value2 = "0&" + value2; - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); + + data += sizeof(int128_t); + read_row.set_field_content(0, data, _mem_pool.get()); ASSERT_TRUE(strncmp(read_row.to_string().c_str(), value1.c_str(), value1.size()) == 0); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); + data += sizeof(int128_t); + read_row.set_field_content(0, data, _mem_pool.get()); ASSERT_TRUE(strncmp(read_row.to_string().c_str(), value2.c_str(), value2.size()) == 0); } @@ -2442,8 +2068,11 @@ TEST_F(TestColumn, SkipLargeIntColumnWithPresent) { value2 = "0&" + value2; ASSERT_EQ(_column_reader->skip(1), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); + char* data = reinterpret_cast(_col_vector->col_data()); + read_row.set_field_content(0, data, _mem_pool.get()); ASSERT_TRUE(strncmp(read_row.to_string().c_str(), value2.c_str(), value2.size()) == 0); } @@ -2500,12 +2129,12 @@ 
TEST_F(TestColumn, SkipLargeIntColumnWithPresent) { // PositionEntryReader entry0; // entry0._positions = _column_writer->index()->mutable_entry(0)->_positions; // entry0._positions_count = _column_writer->index()->mutable_entry(0)->_positions_count; - // entry0._statistics.init(OLAP_FIELD_TYPE_NONE); + // entry0._statistics.init(OLAP_FIELD_TYPE_LARGEINT); // PositionEntryReader entry1; // entry1._positions = _column_writer->index()->mutable_entry(1)->_positions; // entry1._positions_count = _column_writer->index()->mutable_entry(1)->_positions_count; - // entry1._statistics.init(OLAP_FIELD_TYPE_NONE); + // entry1._statistics.init(OLAP_FIELD_TYPE_LARGEINT); // PositionProvider position0(&entry0); // PositionProvider position1(&entry1); @@ -2521,7 +2150,7 @@ TEST_F(TestColumn, SkipLargeIntColumnWithPresent) { // ASSERT_TRUE(strncmp(read_row.to_string().c_str(), value3.c_str(), value3.size()) == 0); // } -TEST_F(TestColumn, DirectVarcharColumnWithoutPresent) { +TEST_F(TestColumn, VectorizedDirectVarcharColumnWithoutPresent) { // write data std::vector tablet_schema; FieldInfo field_info; @@ -2538,6 +2167,7 @@ TEST_F(TestColumn, DirectVarcharColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); + write_row.allocate_memory_for_string_type(tablet_schema); std::vector val_string_array; val_string_array.push_back("YWJjZGU="); //"abcde" base_64_encode is "YWJjZGU=" @@ -2552,7 +2182,6 @@ TEST_F(TestColumn, DirectVarcharColumnWithoutPresent) { for (uint32_t i = 0; i < 2; i++) { ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); } - ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -2562,26 +2191,27 @@ TEST_F(TestColumn, DirectVarcharColumnWithoutPresent) { RowCursor read_row; read_row.init(tablet_schema); + read_row.allocate_memory_for_string_type(tablet_schema); - char read_value[20]; - memset(read_value, 0, 20); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - 
ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_TRUE(strcmp(read_row.to_string().c_str(), "0&YWJjZGU=") == 0); + _col_vector.reset(new ColumnVector()); + + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 5, _mem_pool.get()), OLAP_SUCCESS); + StringSlice* value = reinterpret_cast(_col_vector->col_data()); + ASSERT_TRUE(strncmp(value->data, "YWJjZGU=", value->size) == 0); for (uint32_t i = 0; i < 2; i++) { - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_TRUE(strcmp(read_row.to_string().c_str(), "0&YWJjZGU=") == 0); + value++; + ASSERT_TRUE(strncmp(value->data, "YWJjZGU=", value->size) == 0); } for (uint32_t i = 0; i < 2; i++) { - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_TRUE(strcmp(read_row.to_string().c_str(), "0&ZWRjYmE=") == 0); + value++; + ASSERT_TRUE(strncmp(value->data, "ZWRjYmE=", value->size) == 0); } - ASSERT_NE(_column_reader->next(), OLAP_SUCCESS); + ASSERT_NE(_column_reader->next_vector( + _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); } -TEST_F(TestColumn, DirectVarcharColumnWithPresent) { +TEST_F(TestColumn, VectorizedDirectVarcharColumnWithPresent) { // write data std::vector tablet_schema; FieldInfo field_info; @@ -2598,10 +2228,15 @@ TEST_F(TestColumn, DirectVarcharColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); + write_row.allocate_memory_for_string_type(tablet_schema); + + write_row.set_null(0); + ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); std::vector val_string_array; val_string_array.push_back("YWJjZGU="); //"abcde" base_64_encode is "YWJjZGU=" write_row.from_string(val_string_array); + write_row.set_not_null(0); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; @@ -2612,12 +2247,18 @@ TEST_F(TestColumn, DirectVarcharColumnWithPresent) { RowCursor read_row; 
read_row.init(tablet_schema); + read_row.allocate_memory_for_string_type(tablet_schema); - char read_value[20]; - memset(read_value, 0, 20); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_TRUE(strcmp(read_row.to_string().c_str(), "0&YWJjZGU=") == 0); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); + bool* is_null = _col_vector->is_null(); + ASSERT_EQ(is_null[0], true); + ASSERT_EQ(is_null[1], false); + + StringSlice* value = reinterpret_cast(_col_vector->col_data()); + value++; + ASSERT_TRUE(strncmp(value->data, "YWJjZGU=", value->size) == 0); } TEST_F(TestColumn, SkipDirectVarcharColumnWithPresent) { @@ -2637,6 +2278,7 @@ TEST_F(TestColumn, SkipDirectVarcharColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); + write_row.allocate_memory_for_string_type(tablet_schema); std::vector val_string_array; val_string_array.push_back("YWJjZGU="); //"abcde" base_64_encode is "YWJjZGU=" @@ -2656,13 +2298,16 @@ TEST_F(TestColumn, SkipDirectVarcharColumnWithPresent) { RowCursor read_row; read_row.init(tablet_schema); + read_row.allocate_memory_for_string_type(tablet_schema); char read_value[20]; memset(read_value, 0, 20); ASSERT_EQ(_column_reader->skip(1), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_TRUE(strcmp(read_row.to_string().c_str(), "0&YWFhYWE=") == 0); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); + StringSlice* value = reinterpret_cast(_col_vector->col_data()); + ASSERT_TRUE(strncmp(value->data, "YWFhYWE=", value->size) == 0); } TEST_F(TestColumn, SeekDirectVarcharColumnWithoutPresent) { @@ -2682,6 +2327,7 @@ TEST_F(TestColumn, SeekDirectVarcharColumnWithoutPresent) { RowCursor write_row; 
write_row.init(tablet_schema); + write_row.allocate_memory_for_string_type(tablet_schema); std::vector val_string_array; val_string_array.push_back("YWJjZGU="); //"abcde" base_64_encode is "YWJjZGU=" @@ -2705,32 +2351,34 @@ TEST_F(TestColumn, SeekDirectVarcharColumnWithoutPresent) { RowCursor read_row; read_row.init(tablet_schema); - + read_row.allocate_memory_for_string_type(tablet_schema); PositionEntryReader entry1; entry1._positions = _column_writer->index()->mutable_entry(0)->_positions; entry1._positions_count = _column_writer->index()->mutable_entry(0)->_positions_count; - entry1._statistics.init(OLAP_FIELD_TYPE_NONE, false); + entry1._statistics.init(OLAP_FIELD_TYPE_VARCHAR, false); PositionEntryReader entry2; entry2._positions = _column_writer->index()->mutable_entry(1)->_positions; entry2._positions_count = _column_writer->index()->mutable_entry(1)->_positions_count; - entry2._statistics.init(OLAP_FIELD_TYPE_NONE, false); + entry2._statistics.init(OLAP_FIELD_TYPE_VARCHAR, false); PositionProvider position0(&entry1); PositionProvider position1(&entry2); ASSERT_EQ(_column_reader->seek(&position0), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_TRUE(strcmp(read_row.to_string().c_str(), "0&YWJjZGU=") == 0); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); + StringSlice* value = reinterpret_cast(_col_vector->col_data()); + ASSERT_TRUE(strncmp(value->data, "YWJjZGU=", value->size) == 0); ASSERT_EQ(_column_reader->seek(&position1), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_TRUE(strcmp(read_row.to_string().c_str(), "0&YWFhYWE=") == 0); - - ASSERT_NE(_column_reader->next(), OLAP_SUCCESS); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 1, 
_mem_pool.get()), OLAP_SUCCESS); + value = reinterpret_cast(_col_vector->col_data()); + ASSERT_TRUE(strncmp(value->data, "YWFhYWE=", value->size) == 0); } TEST_F(TestColumn, SeekDirectVarcharColumnWithPresent) { @@ -2750,6 +2398,7 @@ TEST_F(TestColumn, SeekDirectVarcharColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); + write_row.allocate_memory_for_string_type(tablet_schema); std::vector val_string_array; val_string_array.push_back("YWJjZGU="); //"abcde" base_64_encode is "YWJjZGU=" @@ -2773,35 +2422,37 @@ TEST_F(TestColumn, SeekDirectVarcharColumnWithPresent) { RowCursor read_row; read_row.init(tablet_schema); + read_row.allocate_memory_for_string_type(tablet_schema); PositionEntryReader entry1; entry1._positions = _column_writer->index()->mutable_entry(0)->_positions; entry1._positions_count = _column_writer->index()->mutable_entry(0)->_positions_count; - entry1._statistics.init(OLAP_FIELD_TYPE_NONE, false); + entry1._statistics.init(OLAP_FIELD_TYPE_VARCHAR, false); PositionEntryReader entry2; entry2._positions = _column_writer->index()->mutable_entry(1)->_positions; entry2._positions_count = _column_writer->index()->mutable_entry(1)->_positions_count; - entry2._statistics.init(OLAP_FIELD_TYPE_NONE, false); + entry2._statistics.init(OLAP_FIELD_TYPE_VARCHAR, false); PositionProvider position0(&entry1); PositionProvider position1(&entry2); ASSERT_EQ(_column_reader->seek(&position0), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_TRUE(strcmp(read_row.to_string().c_str(), "0&YWJjZGU=") == 0); - + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); + StringSlice* value = reinterpret_cast(_col_vector->col_data()); + ASSERT_TRUE(strncmp(value->data, "YWJjZGU=", value->size) == 0); ASSERT_EQ(_column_reader->seek(&position1), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->next(), 
OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_TRUE(strcmp(read_row.to_string().c_str(), "0&YWFhYWE=") == 0); - - ASSERT_NE(_column_reader->next(), OLAP_SUCCESS); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); + value = reinterpret_cast(_col_vector->col_data()); + ASSERT_TRUE(strncmp(value->data, "YWFhYWE=", value->size) == 0); } -TEST_F(TestColumn, StringColumnWithoutPresent) { +TEST_F(TestColumn, VectorizedStringColumnWithoutPresent) { // write data std::vector tablet_schema; FieldInfo field_info; @@ -2818,6 +2469,7 @@ TEST_F(TestColumn, StringColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); + write_row.allocate_memory_for_string_type(tablet_schema); std::vector val_string_array; val_string_array.push_back("abcde"); //"abcde" base_64_encode is "YWJjZGU=" @@ -2841,26 +2493,27 @@ TEST_F(TestColumn, StringColumnWithoutPresent) { RowCursor read_row; read_row.init(tablet_schema); + read_row.allocate_memory_for_string_type(tablet_schema); - char read_value[20]; - memset(read_value, 0, 20); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_TRUE(strcmp(read_row.to_string().c_str(), "0&abcde") == 0); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 5, _mem_pool.get()), OLAP_SUCCESS); + StringSlice* value = reinterpret_cast(_col_vector->col_data()); + + ASSERT_TRUE(strncmp(value->data, "abcde", value->size) == 0); for (uint32_t i = 0; i < 2; i++) { - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_TRUE(strcmp(read_row.to_string().c_str(), "0&abcde") == 0); + value++; + ASSERT_TRUE(strncmp(value->data, "abcde", value->size) == 0); } for (uint32_t i = 0; i < 2; i++) { - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - 
ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_TRUE(strcmp(read_row.to_string().c_str(), "0&edcba") == 0); + value++; + ASSERT_TRUE(strncmp(value->data, "edcba", value->size) == 0); } - ASSERT_NE(_column_reader->next(), OLAP_SUCCESS); + ASSERT_NE(_column_reader->next_vector( + _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); } -TEST_F(TestColumn, StringColumnWithPresent) { +TEST_F(TestColumn, VectorizedStringColumnWithPresent) { // write data std::vector tablet_schema; FieldInfo field_info; @@ -2877,10 +2530,14 @@ TEST_F(TestColumn, StringColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); + write_row.allocate_memory_for_string_type(tablet_schema); + write_row.set_null(0); + ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); std::vector val_string_array; val_string_array.push_back("abcde"); //"abcde" base_64_encode is "YWJjZGU=" write_row.from_string(val_string_array); + write_row.set_not_null(0); ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; @@ -2891,15 +2548,21 @@ TEST_F(TestColumn, StringColumnWithPresent) { RowCursor read_row; read_row.init(tablet_schema); + read_row.allocate_memory_for_string_type(tablet_schema); - char read_value[20]; - memset(read_value, 0, 20); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_TRUE(strcmp(read_row.to_string().c_str(), "0&abcde") == 0); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); + bool* is_null = _col_vector->is_null(); + ASSERT_EQ(is_null[0], true); + ASSERT_EQ(is_null[1], false); + + StringSlice* value = reinterpret_cast(_col_vector->col_data()); + value++; + ASSERT_TRUE(strncmp(value->data, "abcde", value->size) == 0); } -TEST_F(TestColumn, StringColumnWithoutoutPresent2) { +TEST_F(TestColumn, VectorizedStringColumnWithoutoutPresent2) { // write data 
std::vector tablet_schema; FieldInfo field_info; @@ -2916,6 +2579,7 @@ TEST_F(TestColumn, StringColumnWithoutoutPresent2) { RowCursor write_row; write_row.init(tablet_schema); + write_row.allocate_memory_for_string_type(tablet_schema); std::vector val_string_array; val_string_array.push_back("abcde"); //"abcde" base_64_encode is "YWJjZGU=" @@ -2946,31 +2610,29 @@ TEST_F(TestColumn, StringColumnWithoutoutPresent2) { RowCursor read_row; read_row.init(tablet_schema); + read_row.allocate_memory_for_string_type(tablet_schema); - char read_value[20]; - memset(read_value, 0, 20); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_STREQ(read_row._field_array[0]->_buf, "abcde"); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 5, _mem_pool.get()), OLAP_SUCCESS); + StringSlice* value = reinterpret_cast(_col_vector->col_data()); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_STREQ(read_row._field_array[0]->_buf, "aaaaa"); + ASSERT_TRUE(strncmp(value->data, "abcde", value->size) == 0); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_STREQ(read_row._field_array[0]->_buf, "bbbbb"); + value++; + ASSERT_TRUE(strncmp(value->data, "aaaaa", value->size) == 0); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_STREQ(read_row._field_array[0]->_buf, "ccccc"); + value++; + ASSERT_TRUE(strncmp(value->data, "bbbbb", value->size) == 0); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_STREQ(read_row._field_array[0]->_buf, "ddddd"); + value++; + ASSERT_TRUE(strncmp(value->data, "ccccc", value->size) == 0); + + value++; + ASSERT_TRUE(strncmp(value->data, "ddddd", value->size) == 
0); } -TEST_F(TestColumn, DirectVarcharColumnWith65533) { +TEST_F(TestColumn, VectorizedDirectVarcharColumnWith65533) { // write data std::vector tablet_schema; FieldInfo field_info; @@ -2987,6 +2649,7 @@ TEST_F(TestColumn, DirectVarcharColumnWith65533) { RowCursor write_row; write_row.init(tablet_schema); + write_row.allocate_memory_for_string_type(tablet_schema); std::vector val_string_array; val_string_array.push_back(std::string(65533, 'a')); @@ -3008,28 +2671,23 @@ TEST_F(TestColumn, DirectVarcharColumnWith65533) { RowCursor read_row; read_row.init(tablet_schema); + read_row.allocate_memory_for_string_type(tablet_schema); - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); + _col_vector.reset(new ColumnVector()); + ASSERT_EQ(_column_reader->next_vector( + _col_vector.get(), 3, _mem_pool.get()), OLAP_SUCCESS); + StringSlice* value = reinterpret_cast(_col_vector->col_data()); - for (uint32_t i = 0; i < 65533 + 2; i++) { - if (0 == i) { - ASSERT_EQ(read_row.to_string().c_str()[i], '0'); - } else if (1 == i) { - ASSERT_EQ(read_row.to_string().c_str()[i], '&'); - } else { - ASSERT_EQ(read_row.to_string().c_str()[i], 'a'); - } + for (uint32_t i = 0; i < 65533; i++) { + ASSERT_TRUE(strncmp(value->data + i, "a", 1) == 0); } for (uint32_t i = 0; i < 2; i++) { - ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - ASSERT_TRUE(strcmp(read_row.to_string().c_str(), "0&edcba") == 0); + value++; + ASSERT_TRUE(strncmp(value->data, "edcba", value->size) == 0); } } - } } @@ -3043,7 +2701,6 @@ int main(int argc, char** argv) { int ret = palo::OLAP_SUCCESS; testing::InitGoogleTest(&argc, argv); ret = RUN_ALL_TESTS(); - google::protobuf::ShutdownProtobufLibrary(); return ret; } diff --git a/be/test/olap/command_executor_test.cpp b/be/test/olap/command_executor_test.cpp index 5ca581a2f8..3ef0b4ef82 100644 --- a/be/test/olap/command_executor_test.cpp +++ 
b/be/test/olap/command_executor_test.cpp @@ -64,7 +64,7 @@ static const char* ROLLUP_TABLE_PUSH_DATA = "./be/test/olap/test_data/all_types_ // checksum for base table push data static const uint32_t MAX_RETRY_TIMES = 10; -static const uint32_t BASE_TABLE_PUSH_DATA_CHECKSUM = 3878734322; +static const uint32_t BASE_TABLE_PUSH_DATA_CHECKSUM = 1401759800; static const uint32_t MAX_PATH_LEN = 1024; @@ -849,7 +849,7 @@ TEST_F(TestComputeChecksum, compute_checksum) { request.tablet_id, request.tablet_schema.schema_hash, request.version, request.version_hash, &checksum); ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(CRC32_INIT, checksum); + ASSERT_EQ(0, checksum); // 3. Compute checksum normally. tablets_info.clear(); @@ -864,10 +864,10 @@ TEST_F(TestComputeChecksum, compute_checksum) { ASSERT_EQ(BASE_TABLE_PUSH_DATA_CHECKSUM, checksum); } -class TestBaseExpansion : public ::testing::Test { +class TestBaseCompaction : public ::testing::Test { public: - TestBaseExpansion() : _command_executor(NULL) {} - ~TestBaseExpansion() { + TestBaseCompaction() : _command_executor(NULL) {} + ~TestBaseCompaction() { SAFE_DELETE(_command_executor); } @@ -875,7 +875,7 @@ public: // Create local data dir for OLAPEngine. char buffer[MAX_PATH_LEN]; getcwd(buffer, MAX_PATH_LEN); - config::storage_root_path = string(buffer) + "/test_run/data_base_expansion"; + config::storage_root_path = string(buffer) + "/test_run/data_base_compaction"; remove_all_dir(config::storage_root_path); ASSERT_EQ(create_dir(config::storage_root_path), OLAP_SUCCESS); @@ -894,7 +894,7 @@ public: CommandExecutor* _command_executor; }; -TEST_F(TestBaseExpansion, TestBaseExpansion) { +TEST_F(TestBaseCompaction, TestBaseCompaction) { OLAPStatus res = OLAP_SUCCESS; TCreateTabletReq request; set_default_create_tablet_request(&request); @@ -904,7 +904,7 @@ TEST_F(TestBaseExpansion, TestBaseExpansion) { std::vector tablets_info; // 1. Start BE before tablet created. 
- res = _command_executor->base_expansion( + res = _command_executor->base_compaction( push_req.tablet_id, push_req.schema_hash, push_req.version); ASSERT_EQ(OLAP_ERR_TABLE_NOT_FOUND, res); @@ -915,7 +915,7 @@ TEST_F(TestBaseExpansion, TestBaseExpansion) { request.tablet_id, request.tablet_schema.schema_hash); ASSERT_TRUE(tablet.get() != NULL); - res = _command_executor->base_expansion( + res = _command_executor->base_compaction( request.tablet_id, request.tablet_schema.schema_hash, request.version + 1); ASSERT_EQ(OLAP_ERR_BE_NO_SUITABLE_VERSION, res); } @@ -1311,7 +1311,7 @@ TEST_F(TestSchemaChange, schema_change) { ASSERT_EQ(push_req.version_hash, tablet_info.version_hash); ASSERT_EQ(100, tablet_info.row_count); - //schema change, add a value column + //schema change, modify a key column TCreateTabletReq create_new_tablet4; set_create_tablet_request_4(create_new_tablet3, &create_new_tablet4); TAlterTabletReq request4; @@ -1838,6 +1838,7 @@ int main(int argc, char** argv) { palo::init_glog("be-test"); int ret = palo::OLAP_SUCCESS; testing::InitGoogleTest(&argc, argv); + palo::CpuInfo::init(); palo::set_up(); ret = RUN_ALL_TESTS(); diff --git a/be/test/olap/comparison_predicate_test.cpp b/be/test/olap/comparison_predicate_test.cpp new file mode 100644 index 0000000000..8b50feb5ab --- /dev/null +++ b/be/test/olap/comparison_predicate_test.cpp @@ -0,0 +1,963 @@ +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "olap/field.h" +#include "olap/column_predicate.h" +#include "olap/comparison_predicate.h" +#include "runtime/mem_pool.h" +#include "runtime/string_value.hpp" +#include "runtime/vectorized_row_batch.h" +#include "util/logging.h" + +namespace palo { + +namespace datetime { + +static uint24_t to_date_timestamp(const char* date_string) { + tm time_tm; + strptime(date_string, "%Y-%m-%d", &time_tm); + + int value = (time_tm.tm_year + 1900) * 16 * 32 + + (time_tm.tm_mon + 1) * 32 + + time_tm.tm_mday; + return uint24_t(value); +} + +static uint64_t to_datetime_timestamp(const std::string& value_string) { + tm time_tm; + strptime(value_string.c_str(), "%Y-%m-%d %H:%M:%S", &time_tm); + + uint64_t value = ((time_tm.tm_year + 1900) * 10000L + + (time_tm.tm_mon + 1) * 100L + + time_tm.tm_mday) * 1000000L + + time_tm.tm_hour * 10000L + + time_tm.tm_min * 100L + + time_tm.tm_sec; + + return value; +} + +static std::string to_date_string(uint24_t& date_value) { + tm time_tm; + int value = date_value; + memset(&time_tm, 0, sizeof(time_tm)); + time_tm.tm_mday = static_cast(value & 31); + time_tm.tm_mon = static_cast(value >> 5 & 15) - 1; + time_tm.tm_year = static_cast(value >> 9) - 1900; + char buf[20] = {'\0'}; + strftime(buf, sizeof(buf), "%Y-%m-%d", &time_tm); + return std::string(buf); +} + +static std::string to_datetime_string(uint64_t& datetime_value) { + tm time_tm; + int64_t part1 = (datetime_value / 1000000L); + int64_t part2 = (datetime_value - part1 * 1000000L); + + time_tm.tm_year = static_cast((part1 / 10000L) % 10000) - 1900; + time_tm.tm_mon = static_cast((part1 / 100) % 100) - 1; + time_tm.tm_mday = static_cast(part1 % 100); + + time_tm.tm_hour = static_cast((part2 / 10000L) % 10000); + time_tm.tm_min = static_cast((part2 / 100) % 100); + time_tm.tm_sec = static_cast(part2 % 100); + + char buf[20] = {'\0'}; + 
strftime(buf, 20, "%Y-%m-%d %H:%M:%S", &time_tm); + return std::string(buf); +} + +}; + +#define TEST_PREDICATE_DEFINITION(CLASS_NAME) \ +class CLASS_NAME : public testing::Test { \ +public: \ + CLASS_NAME() : _vectorized_batch(NULL) { \ + _mem_tracker.reset(new MemTracker(-1)); \ + _mem_pool.reset(new MemPool(_mem_tracker.get())); \ + } \ + ~CLASS_NAME() {\ + if (_vectorized_batch != NULL) { \ + delete _vectorized_batch; \ + } \ + } \ + void SetFieldInfo(FieldInfo &field_info, std::string name, \ + FieldType type, FieldAggregationMethod aggregation, \ + uint32_t length, bool is_allow_null, bool is_key) { \ + field_info.name = name; \ + field_info.type = type; \ + field_info.aggregation = aggregation; \ + field_info.length = length; \ + field_info.is_allow_null = is_allow_null; \ + field_info.is_key = is_key; \ + field_info.precision = 1000; \ + field_info.frac = 10000; \ + field_info.unique_id = 0; \ + field_info.is_bf_column = false; \ + } \ + void InitVectorizedBatch(const std::vector& schema, \ + const std::vector&ids, \ + int size) { \ + _vectorized_batch = new VectorizedRowBatch(schema, ids, size); \ + _vectorized_batch->set_size(size); \ + } \ + std::unique_ptr _mem_tracker; \ + std::unique_ptr _mem_pool; \ + VectorizedRowBatch* _vectorized_batch; \ +}; \ + +TEST_PREDICATE_DEFINITION(TestEqualPredicate) +TEST_PREDICATE_DEFINITION(TestLessPredicate) + +#define TEST_EQUAL_PREDICATE(TYPE, TYPE_NAME, FIELD_TYPE) \ +TEST_F(TestEqualPredicate, TYPE_NAME##_COLUMN) { \ + std::vector schema; \ + FieldInfo field_info; \ + SetFieldInfo(field_info, std::string("TYPE_NAME##_COLUMN"), FIELD_TYPE, \ + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); \ + schema.push_back(field_info); \ + int size = 10; \ + std::vector return_columns; \ + for (int i = 0; i < schema.size(); ++i) { \ + return_columns.push_back(i); \ + } \ + InitVectorizedBatch(schema, return_columns, size); \ + ColumnVector* col_vector = _vectorized_batch->column(0); \ + \ + /* for no nulls */ \ + 
col_vector->set_no_nulls(true); \ + TYPE* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(TYPE))); \ + col_vector->set_col_data(col_data); \ + for (int i = 0; i < size; ++i) { \ + *(col_data + i) = i; \ + } \ + TYPE value = 5; \ + ColumnPredicate* pred = new EqualPredicate(0, value); \ + pred->evaluate(_vectorized_batch); \ + ASSERT_EQ(_vectorized_batch->size(), 1); \ + uint16_t* sel = _vectorized_batch->selected(); \ + ASSERT_EQ(*(col_data + sel[0]), 5); \ + \ + /* for has nulls */ \ + col_vector->set_no_nulls(false); \ + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); \ + memset(is_null, 0, size); \ + col_vector->set_is_null(is_null); \ + for (int i = 0; i < size; ++i) { \ + if (i % 2 == 0) { \ + is_null[i] = true; \ + } else { \ + *(col_data + i) = i; \ + } \ + } \ + _vectorized_batch->set_size(size); \ + _vectorized_batch->set_selected_in_use(false); \ + pred->evaluate(_vectorized_batch); \ + ASSERT_EQ(_vectorized_batch->size(), 1); \ + sel = _vectorized_batch->selected(); \ + ASSERT_EQ(*(col_data + sel[0]), 5); \ +} \ + +TEST_EQUAL_PREDICATE(int8_t, TINYINT, OLAP_FIELD_TYPE_TINYINT) +TEST_EQUAL_PREDICATE(int16_t, SMALLINT, OLAP_FIELD_TYPE_SMALLINT) +TEST_EQUAL_PREDICATE(int32_t, INT, OLAP_FIELD_TYPE_INT) +TEST_EQUAL_PREDICATE(int64_t, BIGINT, OLAP_FIELD_TYPE_BIGINT) +TEST_EQUAL_PREDICATE(int128_t, LARGEINT, OLAP_FIELD_TYPE_LARGEINT) + +TEST_F(TestEqualPredicate, FLOAT_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("FLOAT_COLUMN"), OLAP_FIELD_TYPE_FLOAT, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 10; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + float* col_data = reinterpret_cast(_mem_pool->allocate(size * 
sizeof(float))); + col_vector->set_col_data(col_data); + for (int i = 0; i < size; ++i) { + *(col_data + i) = i; + } + float value = 5.0; + ColumnPredicate* pred = new EqualPredicate(0, value); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 1); + uint16_t* sel = _vectorized_batch->selected(); + ASSERT_FLOAT_EQ(*(col_data + sel[0]), 5.0); + + // for has nulls + col_vector->set_no_nulls(false); + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + for (int i = 0; i < size; ++i) { + if (i % 2 == 0) { + is_null[i] = true; + } else { + *(col_data + i) = i; + } + } + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 1); + sel = _vectorized_batch->selected(); + ASSERT_FLOAT_EQ(*(col_data + sel[0]), 5.0); +} + +TEST_F(TestEqualPredicate, DOUBLE_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("DOUBLE_COLUMN"), OLAP_FIELD_TYPE_DOUBLE, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 10; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + double* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(double))); + col_vector->set_col_data(col_data); + for (int i = 0; i < size; ++i) { + *(col_data + i) = i; + } + double value = 5.0; + ColumnPredicate* pred = new EqualPredicate(0, value); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 1); + uint16_t* sel = _vectorized_batch->selected(); + ASSERT_DOUBLE_EQ(*(col_data + sel[0]), 5.0); + + // for has nulls + col_vector->set_no_nulls(false); + bool* is_null = 
reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + for (int i = 0; i < size; ++i) { + if (i % 2 == 0) { + is_null[i] = true; + } else { + *(col_data + i) = i; + } + } + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 1); + sel = _vectorized_batch->selected(); + ASSERT_DOUBLE_EQ(*(col_data + sel[0]), 5.0); +} + +TEST_F(TestEqualPredicate, DECIMAL_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("DECIMAL_COLUMN"), OLAP_FIELD_TYPE_DECIMAL, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 10; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + decimal12_t* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(decimal12_t))); + col_vector->set_col_data(col_data); + for (int i = 0; i < size; ++i) { + (*(col_data + i)).integer = i; + (*(col_data + i)).fraction = i; + } + decimal12_t value(5, 5); + ColumnPredicate* pred = new EqualPredicate(0, value); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 1); + uint16_t* sel = _vectorized_batch->selected(); + ASSERT_EQ(*(col_data + sel[0]), value); + + // for has nulls + col_vector->set_no_nulls(false); + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + for (int i = 0; i < size; ++i) { + if (i % 2 == 0) { + is_null[i] = true; + } else { + (*(col_data + i)).integer = i; + (*(col_data + i)).fraction = i; + } + } + + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + 
ASSERT_EQ(_vectorized_batch->size(), 1); + sel = _vectorized_batch->selected(); + ASSERT_EQ(*(col_data + sel[0]), value); +} + +TEST_F(TestEqualPredicate, STRING_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("STRING_COLUMN"), OLAP_FIELD_TYPE_VARCHAR, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 10; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + StringValue* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(StringValue))); + col_vector->set_col_data(col_data); + + char* string_buffer = reinterpret_cast(_mem_pool->allocate(55)); + for (int i = 0; i < size; ++i) { + for (int j = 0; j <= i; ++j) { + string_buffer[j] = 'a' + i; + } + (*(col_data + i)).len = i + 1; + (*(col_data + i)).ptr = string_buffer; + string_buffer += i + 1; + } + + StringValue value; + const char* value_buffer = "dddd"; + value.len = 4; + value.ptr = const_cast(value_buffer); + + ColumnPredicate* pred = new EqualPredicate(0, value); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 1); + uint16_t* sel = _vectorized_batch->selected(); + ASSERT_EQ(sel[0], 3); + ASSERT_EQ(*(col_data + sel[0]), value); + + // for has nulls + col_vector->set_no_nulls(false); + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + string_buffer = reinterpret_cast(_mem_pool->allocate(55)); + for (int i = 0; i < size; ++i) { + if (i % 2 == 0) { + is_null[i] = true; + } else { + for (int j = 0; j <= i; ++j) { + string_buffer[j] = 'a' + i; + } + (*(col_data + i)).len = i + 1; + (*(col_data + i)).ptr = string_buffer; + } + string_buffer += i + 1; + } + + _vectorized_batch->set_size(size); + 
_vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 1); + sel = _vectorized_batch->selected(); + ASSERT_EQ(*(col_data + sel[0]), value); +} + +TEST_F(TestEqualPredicate, DATE_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("DATE_COLUMN"), OLAP_FIELD_TYPE_DATE, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 6; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + uint24_t* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(uint24_t))); + col_vector->set_col_data(col_data); + + std::vector date_array; + date_array.push_back("2017-09-07"); + date_array.push_back("2017-09-08"); + date_array.push_back("2017-09-09"); + date_array.push_back("2017-09-10"); + date_array.push_back("2017-09-11"); + date_array.push_back("2017-09-12"); + for (int i = 0; i < size; ++i) { + uint24_t timestamp = datetime::to_date_timestamp(date_array[i].c_str()); + *(col_data + i) = timestamp; + } + + uint24_t value = datetime::to_date_timestamp("2017-09-10"); + ColumnPredicate* pred = new EqualPredicate(0, value); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 1); + uint16_t* sel = _vectorized_batch->selected(); + ASSERT_EQ(sel[0], 3); + ASSERT_EQ(*(col_data + sel[0]), value); + ASSERT_EQ(datetime::to_date_string(*(col_data + sel[0])), "2017-09-10"); + + // for has nulls + col_vector->set_no_nulls(false); + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + for (int i = 0; i < size; ++i) { + if (i % 2 == 0) { + is_null[i] = true; + } else { + uint24_t timestamp = 
datetime::to_date_timestamp(date_array[i].c_str()); + *(col_data + i) = timestamp; + } + } + + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 1); + sel = _vectorized_batch->selected(); + ASSERT_EQ(*(col_data + sel[0]), value); + ASSERT_EQ(datetime::to_date_string(*(col_data + sel[0])), "2017-09-10"); +} + +TEST_F(TestEqualPredicate, DATETIME_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("DATETIME_COLUMN"), OLAP_FIELD_TYPE_DATETIME, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 6; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + uint64_t* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(uint64_t))); + col_vector->set_col_data(col_data); + + std::vector date_array; + date_array.push_back("2017-09-07 00:00:00"); + date_array.push_back("2017-09-08 00:01:00"); + date_array.push_back("2017-09-09 00:00:01"); + date_array.push_back("2017-09-10 01:00:00"); + date_array.push_back("2017-09-11 01:01:00"); + date_array.push_back("2017-09-12 01:01:01"); + for (int i = 0; i < size; ++i) { + uint64_t timestamp = datetime::to_datetime_timestamp(date_array[i].c_str()); + *(col_data + i) = timestamp; + } + + uint64_t value = datetime::to_datetime_timestamp("2017-09-10 01:00:00"); + ColumnPredicate* pred = new EqualPredicate(0, value); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 1); + uint16_t* sel = _vectorized_batch->selected(); + ASSERT_EQ(sel[0], 3); + ASSERT_EQ(*(col_data + sel[0]), value); + ASSERT_EQ(datetime::to_datetime_string(*(col_data + sel[0])), "2017-09-10 01:00:00"); + + // for has nulls + 
col_vector->set_no_nulls(false); + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + for (int i = 0; i < size; ++i) { + if (i % 2 == 0) { + is_null[i] = true; + } else { + uint64_t timestamp = datetime::to_datetime_timestamp(date_array[i].c_str()); + *(col_data + i) = timestamp; + } + } + + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 1); + sel = _vectorized_batch->selected(); + ASSERT_EQ(*(col_data + sel[0]), value); + ASSERT_EQ(datetime::to_datetime_string(*(col_data + sel[0])), "2017-09-10 01:00:00"); +} + +#define TEST_LESS_PREDICATE(TYPE, TYPE_NAME, FIELD_TYPE) \ +TEST_F(TestLessPredicate, TYPE_NAME##_COLUMN) { \ + std::vector schema; \ + FieldInfo field_info; \ + SetFieldInfo(field_info, std::string("TYPE_NAME_COLUMN"), FIELD_TYPE, \ + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); \ + schema.push_back(field_info); \ + int size = 10; \ + std::vector return_columns; \ + for (int i = 0; i < schema.size(); ++i) { \ + return_columns.push_back(i); \ + } \ + InitVectorizedBatch(schema, return_columns, size); \ + ColumnVector* col_vector = _vectorized_batch->column(0); \ + \ + /* for no nulls */ \ + col_vector->set_no_nulls(true); \ + TYPE* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(TYPE))); \ + col_vector->set_col_data(col_data); \ + for (int i = 0; i < size; ++i) { \ + *(col_data + i) = i; \ + } \ + TYPE value = 5; \ + ColumnPredicate* pred = new LessPredicate(0, value); \ + pred->evaluate(_vectorized_batch); \ + ASSERT_EQ(_vectorized_batch->size(), 5); \ + uint16_t* sel = _vectorized_batch->selected(); \ + TYPE sum = 0; \ + for (int i = 0; i < _vectorized_batch->size(); ++i) { \ + sum += *(col_data + sel[i]); \ + } \ + ASSERT_EQ(sum, 10); \ + \ + /* for has nulls */ \ + col_vector->set_no_nulls(false); \ + bool* is_null = 
reinterpret_cast(_mem_pool->allocate(size)); \ + memset(is_null, 0, size); \ + col_vector->set_is_null(is_null); \ + for (int i = 0;i < size; ++i) {\ + if (i % 2 == 0) { \ + is_null[i] = true; \ + } else { \ + *(col_data + i) = i; \ + } \ + } \ + _vectorized_batch->set_size(size); \ + _vectorized_batch->set_selected_in_use(false); \ + pred->evaluate(_vectorized_batch); \ + ASSERT_EQ(_vectorized_batch->size(), 2); \ + sel = _vectorized_batch->selected(); \ + sum = 0; \ + for (int i = 0; i < _vectorized_batch->size(); ++i) { \ + sum += *(col_data + sel[i]); \ + } \ + ASSERT_EQ(sum, 4); \ +} \ + +TEST_LESS_PREDICATE(int8_t, TINYINT, OLAP_FIELD_TYPE_TINYINT) +TEST_LESS_PREDICATE(int16_t, SMALLINT, OLAP_FIELD_TYPE_SMALLINT) +TEST_LESS_PREDICATE(int32_t, INT, OLAP_FIELD_TYPE_INT) +TEST_LESS_PREDICATE(int64_t, BIGINT, OLAP_FIELD_TYPE_BIGINT) +TEST_LESS_PREDICATE(int128_t, LARGEINT, OLAP_FIELD_TYPE_LARGEINT) + +TEST_F(TestLessPredicate, FLOAT_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("FLOAT_COLUMN"), OLAP_FIELD_TYPE_FLOAT, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 10; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + float* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(float))); + col_vector->set_col_data(col_data); + for (int i = 0; i < size; ++i) { + *(col_data + i) = i; + } + float value = 5.0; + ColumnPredicate* pred = new LessPredicate(0, value); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 5); + uint16_t* sel = _vectorized_batch->selected(); + float sum = 0; + for (int i = 0; i < _vectorized_batch->size(); ++i) { + sum += *(col_data + sel[i]); + } + ASSERT_FLOAT_EQ(sum, 10.0); + + // for has 
nulls + col_vector->set_no_nulls(false); + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + for (int i = 0; i < size; ++i) { + if (i % 2 == 0) { + is_null[i] = true; + } else { + *(col_data + i) = i; + } + } + + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 2); + sel = _vectorized_batch->selected(); \ + sum = 0; + for (int i = 0; i < _vectorized_batch->size(); ++i) { + sum += *(col_data + sel[i]); + } + ASSERT_FLOAT_EQ(sum, 4.0); +} + +TEST_F(TestLessPredicate, DOUBLE_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("DOUBLE_COLUMN"), OLAP_FIELD_TYPE_DOUBLE, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 10; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + double* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(double))); + col_vector->set_col_data(col_data); + for (int i = 0; i < size; ++i) { + *(col_data + i) = i; + } + double value = 5.0; + ColumnPredicate* pred = new LessPredicate(0, value); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 5); + uint16_t* sel = _vectorized_batch->selected(); + double sum = 0; + for (int i = 0; i < _vectorized_batch->size(); ++i) { + sum += *(col_data + sel[i]); + } + ASSERT_DOUBLE_EQ(sum, 10.0); + + // for has nulls + col_vector->set_no_nulls(false); + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + for (int i = 0; i < size; ++i) { + if (i % 2 == 0) { + is_null[i] = true; + } else { + *(col_data + i) = i; 
+ } + } + + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 2); + sel = _vectorized_batch->selected(); \ + sum = 0; + for (int i = 0; i < _vectorized_batch->size(); ++i) { + sum += *(col_data + sel[i]); + } + ASSERT_DOUBLE_EQ(sum, 4.0); +} + +TEST_F(TestLessPredicate, DECIMAL_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("DECIMAL_COLUMN"), OLAP_FIELD_TYPE_DECIMAL, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 10; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + decimal12_t* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(decimal12_t))); + col_vector->set_col_data(col_data); + for (int i = 0; i < size; ++i) { + (*(col_data + i)).integer = i; + (*(col_data + i)).fraction = i; + } + decimal12_t value(5, 5); + ColumnPredicate* pred = new LessPredicate(0, value); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 5); + uint16_t* sel = _vectorized_batch->selected(); + decimal12_t sum(0, 0); + for (int i = 0; i < _vectorized_batch->size(); ++i) { + sum += *(col_data + sel[i]); + } + ASSERT_EQ(sum.integer, 10); + ASSERT_EQ(sum.fraction, 10); + + // for has nulls + col_vector->set_no_nulls(false); + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + for (int i = 0; i < size; ++i) { + if (i % 2 == 0) { + is_null[i] = true; + } else { + (*(col_data + i)).integer = i; + (*(col_data + i)).fraction = i; + } + } + + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + 
ASSERT_EQ(_vectorized_batch->size(), 2); + sum.integer = 0; + sum.fraction = 0; + for (int i = 0; i < _vectorized_batch->size(); ++i) { + sum += *(col_data + sel[i]); + } + ASSERT_EQ(sum.integer, 4); + ASSERT_EQ(sum.fraction, 4); +} + +TEST_F(TestLessPredicate, STRING_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("STRING_COLUMN"), OLAP_FIELD_TYPE_VARCHAR, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 10; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + StringValue* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(StringValue))); + col_vector->set_col_data(col_data); + + char* string_buffer = reinterpret_cast(_mem_pool->allocate(55)); + for (int i = 0; i < size; ++i) { + for (int j = 0; j <= i; ++j) { + string_buffer[j] = 'a' + i; + } + (*(col_data + i)).len = i + 1; + (*(col_data + i)).ptr = string_buffer; + string_buffer += i + 1; + } + + StringValue value; + const char* value_buffer = "dddd"; + value.len = 4; + value.ptr = const_cast(value_buffer); + + ColumnPredicate* pred = new LessPredicate(0, value); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 3); + uint16_t* sel = _vectorized_batch->selected(); + ASSERT_TRUE(strncmp((*(col_data + sel[0])).ptr, "a", 1) == 0); + + // for has nulls + col_vector->set_no_nulls(false); + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + string_buffer = reinterpret_cast(_mem_pool->allocate(55)); + for (int i = 0; i < size; ++i) { + if (i % 2 == 0) { + is_null[i] = true; + } else { + for (int j = 0; j <= i; ++j) { + string_buffer[j] = 'a' + i; + } + (*(col_data + i)).len = i + 1; + 
(*(col_data + i)).ptr = string_buffer; + } + string_buffer += i + 1; + } + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 1); + sel = _vectorized_batch->selected(); + ASSERT_TRUE(strncmp((*(col_data + sel[0])).ptr, "bb", 2) == 0); +} + +TEST_F(TestLessPredicate, DATE_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("DATE_COLUMN"), OLAP_FIELD_TYPE_DATE, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 6; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + uint24_t* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(uint24_t))); + col_vector->set_col_data(col_data); + + std::vector date_array; + date_array.push_back("2017-09-07"); + date_array.push_back("2017-09-08"); + date_array.push_back("2017-09-09"); + date_array.push_back("2017-09-10"); + date_array.push_back("2017-09-11"); + date_array.push_back("2017-09-12"); + for (int i = 0; i < size; ++i) { + uint24_t timestamp = datetime::to_date_timestamp(date_array[i].c_str()); + *(col_data + i) = timestamp; + } + + uint24_t value = datetime::to_date_timestamp("2017-09-10"); + ColumnPredicate* pred = new LessPredicate(0, value); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 3); + uint16_t* sel = _vectorized_batch->selected(); + ASSERT_EQ(datetime::to_date_string(*(col_data + sel[0])), "2017-09-07"); + + // for has nulls + col_vector->set_no_nulls(false); + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + for (int i = 0; i < size; ++i) { + if (i % 2 == 0) { + is_null[i] = true; + } 
else { + uint24_t timestamp = datetime::to_date_timestamp(date_array[i].c_str()); + *(col_data + i) = timestamp; + } + } + + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 1); + sel = _vectorized_batch->selected(); + ASSERT_EQ(datetime::to_date_string(*(col_data + sel[0])), "2017-09-08"); +} + +TEST_F(TestLessPredicate, DATETIME_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("DATETIME_COLUMN"), OLAP_FIELD_TYPE_DATETIME, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 6; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + uint64_t* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(uint64_t))); + col_vector->set_col_data(col_data); + + std::vector date_array; + date_array.push_back("2017-09-07 00:00:00"); + date_array.push_back("2017-09-08 00:01:00"); + date_array.push_back("2017-09-09 00:00:01"); + date_array.push_back("2017-09-10 01:00:00"); + date_array.push_back("2017-09-11 01:01:00"); + date_array.push_back("2017-09-12 01:01:01"); + for (int i = 0; i < size; ++i) { + uint64_t timestamp = datetime::to_datetime_timestamp(date_array[i].c_str()); + *(col_data + i) = timestamp; + } + + uint64_t value = datetime::to_datetime_timestamp("2017-09-10 01:00:00"); + ColumnPredicate* pred = new LessPredicate(0, value); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 3); + uint16_t* sel = _vectorized_batch->selected(); + ASSERT_EQ(datetime::to_datetime_string(*(col_data + sel[0])), "2017-09-07 00:00:00"); + + // for has nulls + col_vector->set_no_nulls(false); + bool* is_null = 
reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + for (int i = 0; i < size; ++i) { + if (i % 2 == 0) { + is_null[i] = true; + } else { + uint64_t timestamp = datetime::to_datetime_timestamp(date_array[i].c_str()); + *(col_data + i) = timestamp; + } + } + + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 1); + sel = _vectorized_batch->selected(); + ASSERT_EQ(datetime::to_datetime_string(*(col_data + sel[0])), "2017-09-08 00:01:00"); +} + +} // namespace palo + +int main(int argc, char** argv) { + std::string conffile = std::string(getenv("PALO_HOME")) + "/conf/be.conf"; + if (!palo::config::init(conffile.c_str(), false)) { + fprintf(stderr, "error read config file. \n"); + return -1; + } + palo::init_glog("be-test"); + int ret = palo::OLAP_SUCCESS; + testing::InitGoogleTest(&argc, argv); + palo::CpuInfo::init(); + ret = RUN_ALL_TESTS(); + google::protobuf::ShutdownProtobufLibrary(); + return ret; +} diff --git a/be/test/olap/delete_handler_test.cpp b/be/test/olap/delete_handler_test.cpp index cff5c1e4b7..21c5a5687f 100644 --- a/be/test/olap/delete_handler_test.cpp +++ b/be/test/olap/delete_handler_test.cpp @@ -810,6 +810,7 @@ protected: _header_file_name = _olap_table->header_file_name(); _data_row_cursor.init(_olap_table->tablet_schema()); + _data_row_cursor.allocate_memory_for_string_type(_olap_table->tablet_schema()); } OLAPStatus push_empty_delta(int32_t version) { diff --git a/be/test/olap/file_helper_test.cpp b/be/test/olap/file_helper_test.cpp index 88abf86795..ec6b4d8ab4 100644 --- a/be/test/olap/file_helper_test.cpp +++ b/be/test/olap/file_helper_test.cpp @@ -37,14 +37,14 @@ namespace palo { class FileHandlerTest : public testing::Test { public: // create a mock cgroup folder - static void SetUpTestCase() { + virtual void SetUp() { 
ASSERT_FALSE(boost::filesystem::exists(_s_test_data_path)); // create a mock cgroup path ASSERT_TRUE(boost::filesystem::create_directory(_s_test_data_path)); } // delete the mock cgroup folder - static void TearDownTestCase() { + virtual void TearDown() { ASSERT_TRUE(boost::filesystem::remove_all(_s_test_data_path)); } @@ -52,7 +52,7 @@ public: static std::string _s_test_data_path; }; -std::string FileHandlerTest::_s_test_data_path = "./file_handler_testxxxx123"; +std::string FileHandlerTest::_s_test_data_path = "./log/file_handler_testxxxx123"; TEST_F(FileHandlerTest, TestWrite) { FileHandler file_handler; @@ -101,6 +101,11 @@ TEST_F(FileHandlerTest, TestWrite) { } // namespace palo int main(int argc, char **argv) { + std::string conffile = std::string(getenv("PALO_HOME")) + "/conf/be.conf"; + if (!palo::config::init(conffile.c_str(), false)) { + fprintf(stderr, "error read config file. \n"); + return -1; + } palo::init_glog("be-test"); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/be/test/olap/file_utils_test.cpp b/be/test/olap/file_utils_test.cpp index 6221081bda..136ba267be 100644 --- a/be/test/olap/file_utils_test.cpp +++ b/be/test/olap/file_utils_test.cpp @@ -38,14 +38,14 @@ namespace palo { class FileUtilsTest : public testing::Test { public: // create a mock cgroup folder - static void SetUpTestCase() { + virtual void SetUp() { ASSERT_FALSE(boost::filesystem::exists(_s_test_data_path)); // create a mock cgroup path ASSERT_TRUE(boost::filesystem::create_directory(_s_test_data_path)); } // delete the mock cgroup folder - static void TearDownTestCase() { + virtual void TearDown() { ASSERT_TRUE(boost::filesystem::remove_all(_s_test_data_path)); } @@ -87,6 +87,11 @@ TEST_F(FileUtilsTest, TestCopyFile) { } // namespace palo int main(int argc, char **argv) { + std::string conffile = std::string(getenv("PALO_HOME")) + "/conf/be.conf"; + if (!palo::config::init(conffile.c_str(), false)) { + fprintf(stderr, "error read config file. 
\n"); + return -1; + } palo::init_glog("be-test"); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/be/test/olap/in_list_predicate_test.cpp b/be/test/olap/in_list_predicate_test.cpp new file mode 100644 index 0000000000..daa0f4a495 --- /dev/null +++ b/be/test/olap/in_list_predicate_test.cpp @@ -0,0 +1,683 @@ +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "olap/field.h" +#include "olap/column_predicate.h" +#include "olap/in_list_predicate.h" +#include "runtime/mem_pool.h" +#include "runtime/string_value.hpp" +#include "runtime/vectorized_row_batch.h" +#include "util/logging.h" + +namespace palo { + +namespace datetime { + +static uint24_t timestamp_from_date(const char* date_string) { + tm time_tm; + strptime(date_string, "%Y-%m-%d", &time_tm); + + int value = (time_tm.tm_year + 1900) * 16 * 32 + + (time_tm.tm_mon + 1) * 32 + + time_tm.tm_mday; + return uint24_t(value); +} + +static uint64_t timestamp_from_datetime(const std::string& value_string) { + tm time_tm; + strptime(value_string.c_str(), "%Y-%m-%d %H:%M:%S", &time_tm); + + uint64_t value = ((time_tm.tm_year + 1900) * 10000L + + (time_tm.tm_mon + 1) * 100L + + time_tm.tm_mday) * 1000000L + + time_tm.tm_hour * 10000L + + time_tm.tm_min * 100L + + time_tm.tm_sec; + + return value; +} + +static std::string to_date_string(uint24_t& date_value) { + tm time_tm; + int value = 
date_value; + memset(&time_tm, 0, sizeof(time_tm)); + time_tm.tm_mday = static_cast(value & 31); + time_tm.tm_mon = static_cast(value >> 5 & 15) - 1; + time_tm.tm_year = static_cast(value >> 9) - 1900; + char buf[20] = {'\0'}; + strftime(buf, sizeof(buf), "%Y-%m-%d", &time_tm); + return std::string(buf); +} + +static std::string to_datetime_string(uint64_t& datetime_value) { + tm time_tm; + int64_t part1 = (datetime_value / 1000000L); + int64_t part2 = (datetime_value - part1 * 1000000L); + + time_tm.tm_year = static_cast((part1 / 10000L) % 10000) - 1900; + time_tm.tm_mon = static_cast((part1 / 100) % 100) - 1; + time_tm.tm_mday = static_cast(part1 % 100); + + time_tm.tm_hour = static_cast((part2 / 10000L) % 10000); + time_tm.tm_min = static_cast((part2 / 100) % 100); + time_tm.tm_sec = static_cast(part2 % 100); + + char buf[20] = {'\0'}; + strftime(buf, 20, "%Y-%m-%d %H:%M:%S", &time_tm); + return std::string(buf); +} + +}; + +class TestInListPredicate : public testing::Test { +public: + TestInListPredicate() : _vectorized_batch(NULL) { + _mem_tracker.reset(new MemTracker(-1)); + _mem_pool.reset(new MemPool(_mem_tracker.get())); + } + + ~TestInListPredicate() { + if (_vectorized_batch != NULL) { + delete _vectorized_batch; + } + } + + void SetFieldInfo(FieldInfo &field_info, std::string name, + FieldType type, FieldAggregationMethod aggregation, + uint32_t length, bool is_allow_null, bool is_key) { + field_info.name = name; + field_info.type = type; + field_info.aggregation = aggregation; + field_info.length = length; + field_info.is_allow_null = is_allow_null; + field_info.is_key = is_key; + field_info.precision = 1000; + field_info.frac = 10000; + field_info.unique_id = 0; + field_info.is_bf_column = false; + } + + void InitVectorizedBatch(const std::vector& schema, + const std::vector& ids, + int size) { + _vectorized_batch = new VectorizedRowBatch(schema, ids, size); + _vectorized_batch->set_size(size); + } + std::unique_ptr _mem_tracker; + std::unique_ptr 
_mem_pool; + VectorizedRowBatch* _vectorized_batch; +}; + +#define TEST_IN_LIST_PREDICATE(TYPE, TYPE_NAME, FIELD_TYPE) \ +TEST_F(TestInListPredicate, TYPE_NAME##_COLUMN) { \ + std::vector schema; \ + FieldInfo field_info; \ + SetFieldInfo(field_info, std::string("TYPE_NAME##_COLUMN"), FIELD_TYPE, \ + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); \ + schema.push_back(field_info); \ + int size = 10; \ + std::vector return_columns; \ + for (int i = 0; i < schema.size(); ++i) { \ + return_columns.push_back(i); \ + } \ + InitVectorizedBatch(schema, return_columns, size); \ + ColumnVector* col_vector = _vectorized_batch->column(0); \ + \ + /* for no nulls */ \ + col_vector->set_no_nulls(true); \ + TYPE* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(TYPE))); \ + col_vector->set_col_data(col_data); \ + for (int i = 0; i < size; ++i) { \ + *(col_data + i) = i; \ + } \ + \ + std::set values; \ + values.insert(4); \ + values.insert(5); \ + values.insert(6); \ + ColumnPredicate* pred = new InListPredicate(0, std::move(values)); \ + pred->evaluate(_vectorized_batch); \ + ASSERT_EQ(_vectorized_batch->size(), 3); \ + uint16_t* sel = _vectorized_batch->selected(); \ + ASSERT_EQ(*(col_data + sel[0]), 4); \ + ASSERT_EQ(*(col_data + sel[1]), 5); \ + ASSERT_EQ(*(col_data + sel[2]), 6); \ + \ + /* for has nulls */ \ + col_vector->set_no_nulls(false); \ + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); \ + memset(is_null, 0, size); \ + col_vector->set_is_null(is_null); \ + for (int i = 0; i < size; ++i) { \ + if (i % 2 == 0) { \ + is_null[i] = true; \ + } else { \ + *(col_data + i) = i; \ + } \ + } \ + _vectorized_batch->set_size(size); \ + _vectorized_batch->set_selected_in_use(false); \ + pred->evaluate(_vectorized_batch); \ + ASSERT_EQ(_vectorized_batch->size(), 1); \ + sel = _vectorized_batch->selected(); \ + ASSERT_EQ(*(col_data + sel[0]), 5); \ +} \ + +TEST_IN_LIST_PREDICATE(int8_t, TINYINT, OLAP_FIELD_TYPE_TINYINT) +TEST_IN_LIST_PREDICATE(int16_t, 
SMALLINT, OLAP_FIELD_TYPE_SMALLINT) +TEST_IN_LIST_PREDICATE(int32_t, INT, OLAP_FIELD_TYPE_INT) +TEST_IN_LIST_PREDICATE(int64_t, BIGINT, OLAP_FIELD_TYPE_BIGINT) +TEST_IN_LIST_PREDICATE(int128_t, LARGEINT, OLAP_FIELD_TYPE_LARGEINT) + +TEST_F(TestInListPredicate, FLOAT_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("FLOAT_COLUMN"), OLAP_FIELD_TYPE_FLOAT, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 10; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + float* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(float))); + col_vector->set_col_data(col_data); + for (int i = 0; i < size; ++i) { + *(col_data + i) = i + 0.1; + } + std::set values; + values.insert(4.1); + values.insert(5.1); + values.insert(6.1); + ColumnPredicate* pred = new InListPredicate(0, std::move(values)); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 3); + uint16_t* sel = _vectorized_batch->selected(); + ASSERT_FLOAT_EQ(*(col_data + sel[0]), 4.1); + ASSERT_FLOAT_EQ(*(col_data + sel[1]), 5.1); + ASSERT_FLOAT_EQ(*(col_data + sel[2]), 6.1); + + // for has nulls + col_vector->set_no_nulls(false); + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + for (int i = 0; i < size; ++i) { + if (i % 2 == 0) { + is_null[i] = true; + } else { + *(col_data + i) = i + 0.1; + } + } + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 1); + sel = _vectorized_batch->selected(); + ASSERT_FLOAT_EQ(*(col_data + sel[0]), 5.1); +} + +TEST_F(TestInListPredicate, DOUBLE_COLUMN) { + 
std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("DOUBLE_COLUMN"), OLAP_FIELD_TYPE_DOUBLE, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 10; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + double* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(double))); + col_vector->set_col_data(col_data); + for (int i = 0; i < size; ++i) { + *(col_data + i) = i + 0.1; + } + std::set values; + values.insert(4.1); + values.insert(5.1); + values.insert(6.1); + + ColumnPredicate* pred = new InListPredicate(0, std::move(values)); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 3); + uint16_t* sel = _vectorized_batch->selected(); + ASSERT_DOUBLE_EQ(*(col_data + sel[0]), 4.1); + ASSERT_DOUBLE_EQ(*(col_data + sel[1]), 5.1); + ASSERT_DOUBLE_EQ(*(col_data + sel[2]), 6.1); + + // for has nulls + col_vector->set_no_nulls(false); + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + for (int i = 0; i < size; ++i) { + if (i % 2 == 0) { + is_null[i] = true; + } else { + *(col_data + i) = i + 0.1; + } + } + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 1); + sel = _vectorized_batch->selected(); + ASSERT_DOUBLE_EQ(*(col_data + sel[0]), 5.1); +} + +TEST_F(TestInListPredicate, DECIMAL_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("DECIMAL_COLUMN"), OLAP_FIELD_TYPE_DECIMAL, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 10; + std::vector return_columns; + for (int i = 
0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + decimal12_t* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(decimal12_t))); + col_vector->set_col_data(col_data); + for (int i = 0; i < size; ++i) { + (*(col_data + i)).integer = i; + (*(col_data + i)).fraction = i; + } + + std::set values; + decimal12_t value1(4, 4); + values.insert(value1); + + decimal12_t value2(5, 5); + values.insert(value2); + + decimal12_t value3(6, 6); + values.insert(value3); + + ColumnPredicate* pred = new InListPredicate(0, std::move(values)); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 3); + uint16_t* sel = _vectorized_batch->selected(); + ASSERT_EQ(*(col_data + sel[0]), value1); + ASSERT_EQ(*(col_data + sel[1]), value2); + ASSERT_EQ(*(col_data + sel[2]), value3); + + // for has nulls + col_vector->set_no_nulls(false); + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + for (int i = 0; i < size; ++i) { + if (i % 2 == 0) { + is_null[i] = true; + } else { + (*(col_data + i)).integer = i; + (*(col_data + i)).fraction = i; + } + } + + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 1); + sel = _vectorized_batch->selected(); + ASSERT_EQ(*(col_data + sel[0]), value2); +} + +TEST_F(TestInListPredicate, CHAR_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("STRING_COLUMN"), OLAP_FIELD_TYPE_CHAR, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 10; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, 
return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + StringValue* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(StringValue))); + col_vector->set_col_data(col_data); + + char* string_buffer = reinterpret_cast(_mem_pool->allocate(50)); + memset(string_buffer, 0, 50); + for (int i = 0; i < size; ++i) { + for (int j = 0; j <= 5; ++j) { + string_buffer[j] = 'a' + i; + } + (*(col_data + i)).len = 5; + (*(col_data + i)).ptr = string_buffer; + string_buffer += 5; + } + + std::set values; + StringValue value1; + const char* value1_buffer = "aaaaa"; + value1.ptr = const_cast(value1_buffer); + value1.len = 5; + values.insert(value1); + + StringValue value2; + const char* value2_buffer = "bbbbb"; + value2.ptr = const_cast(value2_buffer); + value2.len = 5; + values.insert(value2); + + StringValue value3; + const char* value3_buffer = "ccccc"; + value3.ptr = const_cast(value3_buffer); + value3.len = 5; + values.insert(value3); + + ColumnPredicate* pred = new InListPredicate(0, std::move(values)); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 3); + uint16_t* sel = _vectorized_batch->selected(); + ASSERT_EQ(*(col_data + sel[0]), value1); + ASSERT_EQ(*(col_data + sel[1]), value2); + ASSERT_EQ(*(col_data + sel[2]), value3); + + // for has nulls + col_vector->set_no_nulls(false); + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + string_buffer = reinterpret_cast(_mem_pool->allocate(50)); + memset(string_buffer, 0, 50); + for (int i = 0; i < size; ++i) { + if (i % 2 == 0) { + is_null[i] = true; + } else { + for (int j = 0; j <= 5; ++j) { + string_buffer[j] = 'a' + i; + } + (*(col_data + i)).len = 5; + (*(col_data + i)).ptr = string_buffer; + } + string_buffer += 5; + } + + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + 
pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 1); + sel = _vectorized_batch->selected(); + ASSERT_EQ(*(col_data + sel[0]), value2); +} + +TEST_F(TestInListPredicate, VARCHAR_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("STRING_COLUMN"), OLAP_FIELD_TYPE_VARCHAR, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 10; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + StringValue* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(StringValue))); + col_vector->set_col_data(col_data); + + char* string_buffer = reinterpret_cast(_mem_pool->allocate(55)); + for (int i = 0; i < size; ++i) { + for (int j = 0; j <= i; ++j) { + string_buffer[j] = 'a' + i; + } + (*(col_data + i)).len = i + 1; + (*(col_data + i)).ptr = string_buffer; + string_buffer += i + 1; + } + + std::set values; + StringValue value1; + const char* value1_buffer = "a"; + value1.ptr = const_cast(value1_buffer); + value1.len = 1; + values.insert(value1); + + StringValue value2; + const char* value2_buffer = "bb"; + value2.ptr = const_cast(value2_buffer); + value2.len = 2; + values.insert(value2); + + StringValue value3; + const char* value3_buffer = "ccc"; + value3.ptr = const_cast(value3_buffer); + value3.len = 3; + values.insert(value3); + + ColumnPredicate* pred = new InListPredicate(0, std::move(values)); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 3); + uint16_t* sel = _vectorized_batch->selected(); + ASSERT_EQ(*(col_data + sel[0]), value1); + ASSERT_EQ(*(col_data + sel[1]), value2); + ASSERT_EQ(*(col_data + sel[2]), value3); + + // for has nulls + col_vector->set_no_nulls(false); + bool* is_null = 
reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + string_buffer = reinterpret_cast(_mem_pool->allocate(55)); + for (int i = 0; i < size; ++i) { + if (i % 2 == 0) { + is_null[i] = true; + } else { + for (int j = 0; j <= i; ++j) { + string_buffer[j] = 'a' + i; + } + (*(col_data + i)).len = i + 1; + (*(col_data + i)).ptr = string_buffer; + } + string_buffer += i + 1; + } + + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 1); + sel = _vectorized_batch->selected(); + ASSERT_EQ(*(col_data + sel[0]), value2); +} + +TEST_F(TestInListPredicate, DATE_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("DATE_COLUMN"), OLAP_FIELD_TYPE_DATE, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 6; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + uint24_t* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(uint24_t))); + col_vector->set_col_data(col_data); + + std::vector date_array; + date_array.push_back("2017-09-07"); + date_array.push_back("2017-09-08"); + date_array.push_back("2017-09-09"); + date_array.push_back("2017-09-10"); + date_array.push_back("2017-09-11"); + date_array.push_back("2017-09-12"); + for (int i = 0; i < size; ++i) { + uint24_t timestamp = datetime::timestamp_from_date(date_array[i].c_str()); + *(col_data + i) = timestamp; + } + + std::set values; + uint24_t value1 = datetime::timestamp_from_date("2017-09-09"); + values.insert(value1); + + uint24_t value2 = datetime::timestamp_from_date("2017-09-10"); + values.insert(value2); + + uint24_t value3 = 
datetime::timestamp_from_date("2017-09-11"); + values.insert(value3); + + ColumnPredicate* pred = new InListPredicate(0, std::move(values)); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 3); + uint16_t* sel = _vectorized_batch->selected(); + ASSERT_EQ(datetime::to_date_string(*(col_data + sel[0])), "2017-09-09"); + ASSERT_EQ(datetime::to_date_string(*(col_data + sel[1])), "2017-09-10"); + ASSERT_EQ(datetime::to_date_string(*(col_data + sel[2])), "2017-09-11"); + + // for has nulls + col_vector->set_no_nulls(false); + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + for (int i = 0; i < size; ++i) { + if (i % 2 == 0) { + is_null[i] = true; + } else { + uint24_t timestamp = datetime::timestamp_from_date(date_array[i].c_str()); + *(col_data + i) = timestamp; + } + } + + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 1); + sel = _vectorized_batch->selected(); + ASSERT_EQ(datetime::to_date_string(*(col_data + sel[0])), "2017-09-10"); +} + +TEST_F(TestInListPredicate, DATETIME_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("DATETIME_COLUMN"), OLAP_FIELD_TYPE_DATETIME, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 6; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + uint64_t* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(uint64_t))); + col_vector->set_col_data(col_data); + + std::vector date_array; + date_array.push_back("2017-09-07 00:00:00"); + date_array.push_back("2017-09-08 00:01:00"); + 
date_array.push_back("2017-09-09 00:00:01"); + date_array.push_back("2017-09-10 01:00:00"); + date_array.push_back("2017-09-11 01:01:00"); + date_array.push_back("2017-09-12 01:01:01"); + for (int i = 0; i < size; ++i) { + uint64_t timestamp = datetime::timestamp_from_datetime(date_array[i].c_str()); + *(col_data + i) = timestamp; + } + + std::set values; + uint64_t value1 = datetime::timestamp_from_datetime("2017-09-09 00:00:01"); + values.insert(value1); + + uint64_t value2 = datetime::timestamp_from_datetime("2017-09-10 01:00:00"); + values.insert(value2); + + uint64_t value3 = datetime::timestamp_from_datetime("2017-09-11 01:01:00"); + values.insert(value3); + + ColumnPredicate* pred = new InListPredicate(0, std::move(values)); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 3); + uint16_t* sel = _vectorized_batch->selected(); + ASSERT_EQ(datetime::to_datetime_string(*(col_data + sel[0])), "2017-09-09 00:00:01"); + ASSERT_EQ(datetime::to_datetime_string(*(col_data + sel[1])), "2017-09-10 01:00:00"); + ASSERT_EQ(datetime::to_datetime_string(*(col_data + sel[2])), "2017-09-11 01:01:00"); + + // for has nulls + col_vector->set_no_nulls(false); + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + for (int i = 0; i < size; ++i) { + if (i % 2 == 0) { + is_null[i] = true; + } else { + uint64_t timestamp = datetime::timestamp_from_datetime(date_array[i].c_str()); + *(col_data + i) = timestamp; + } + } + + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 1); + sel = _vectorized_batch->selected(); + ASSERT_EQ(datetime::to_datetime_string(*(col_data + sel[0])), "2017-09-10 01:00:00"); +} + +} // namespace palo + +int main(int argc, char** argv) { + std::string conffile = std::string(getenv("PALO_HOME")) + "/conf/be.conf"; + if 
(!palo::config::init(conffile.c_str(), false)) { + fprintf(stderr, "error read config file. \n"); + return -1; + } + palo::init_glog("be-test"); + int ret = palo::OLAP_SUCCESS; + testing::InitGoogleTest(&argc, argv); + palo::CpuInfo::init(); + ret = RUN_ALL_TESTS(); + google::protobuf::ShutdownProtobufLibrary(); + return ret; +} diff --git a/be/test/olap/mock_command_executor.h b/be/test/olap/mock_command_executor.h index 482881f209..8b403fa2e1 100644 --- a/be/test/olap/mock_command_executor.h +++ b/be/test/olap/mock_command_executor.h @@ -64,7 +64,7 @@ public: OLAPStatus(const TPushReq& request, std::vector* tablet_info_vec)); MOCK_METHOD1(cancel_delete, OLAPStatus(const TCancelDeleteDataReq& request)); MOCK_METHOD3( - base_expansion, + base_compaction, OLAPStatus(TTabletId tablet_id, TSchemaHash schema_hash, TVersion version)); MOCK_METHOD4( update_header, diff --git a/be/test/olap/null_predicate_test.cpp b/be/test/olap/null_predicate_test.cpp new file mode 100644 index 0000000000..ac160a9ad6 --- /dev/null +++ b/be/test/olap/null_predicate_test.cpp @@ -0,0 +1,464 @@ +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include "olap/field.h" +#include "olap/column_predicate.h" +#include "olap/null_predicate.h" +#include "runtime/mem_pool.h" +#include "runtime/string_value.hpp" +#include "runtime/vectorized_row_batch.h" +#include "util/logging.h" + +namespace palo { + +namespace datetime { + +static uint24_t to_date_timestamp(const char* date_string) { + tm time_tm; + strptime(date_string, "%Y-%m-%d", &time_tm); + + int value = (time_tm.tm_year + 1900) * 16 * 32 + + (time_tm.tm_mon + 1) * 32 + + time_tm.tm_mday; + return uint24_t(value); +} + +static uint64_t to_datetime_timestamp(const std::string& value_string) { + tm time_tm; + strptime(value_string.c_str(), "%Y-%m-%d %H:%M:%S", &time_tm); + + uint64_t value = ((time_tm.tm_year + 1900) * 10000L + + (time_tm.tm_mon + 1) * 100L + + time_tm.tm_mday) * 1000000L + + time_tm.tm_hour * 10000L + + time_tm.tm_min * 100L + + time_tm.tm_sec; + + return value; +} + +}; + +class TestNullPredicate : public testing::Test { +public: + TestNullPredicate() : _vectorized_batch(NULL) { + _mem_tracker.reset(new MemTracker(-1)); + _mem_pool.reset(new MemPool(_mem_tracker.get())); + } + + ~TestNullPredicate() { + if (_vectorized_batch != NULL) { + delete _vectorized_batch; + } + } + + void SetFieldInfo(FieldInfo &field_info, std::string name, + FieldType type, FieldAggregationMethod aggregation, + uint32_t length, bool is_allow_null, bool is_key) { + field_info.name = name; + field_info.type = type; + field_info.aggregation = aggregation; + field_info.length = length; + field_info.is_allow_null = is_allow_null; + field_info.is_key = is_key; + field_info.precision = 1000; + field_info.frac = 10000; + field_info.unique_id = 0; + field_info.is_bf_column = false; + } + + void InitVectorizedBatch(const std::vector& schema, + const std::vector&ids, + int size) { + _vectorized_batch = new VectorizedRowBatch(schema, ids, size); + _vectorized_batch->set_size(size); + } + std::unique_ptr _mem_tracker; + std::unique_ptr 
_mem_pool; + VectorizedRowBatch* _vectorized_batch; +}; + +#define TEST_IN_LIST_PREDICATE(TYPE, TYPE_NAME, FIELD_TYPE) \ +TEST_F(TestNullPredicate, TYPE_NAME##_COLUMN) { \ + std::vector schema; \ + FieldInfo field_info; \ + SetFieldInfo(field_info, std::string("TYPE_NAME##_COLUMN"), FIELD_TYPE, \ + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); \ + schema.push_back(field_info); \ + int size = 10; \ + std::vector return_columns; \ + for (int i = 0; i < schema.size(); ++i) { \ + return_columns.push_back(i); \ + } \ + InitVectorizedBatch(schema, return_columns, size); \ + ColumnVector* col_vector = _vectorized_batch->column(0); \ + \ + /* for no nulls */ \ + col_vector->set_no_nulls(true); \ + TYPE* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(TYPE))); \ + col_vector->set_col_data(col_data); \ + for (int i = 0; i < size; ++i) { \ + *(col_data + i) = i; \ + } \ + \ + ColumnPredicate* pred = new NullPredicate(0, true); \ + pred->evaluate(_vectorized_batch); \ + ASSERT_EQ(_vectorized_batch->size(), 0); \ + \ + /* for has nulls */ \ + col_vector->set_no_nulls(false); \ + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); \ + memset(is_null, 0, size); \ + col_vector->set_is_null(is_null); \ + for (int i = 0; i < size; ++i) { \ + if (i % 2 == 0) { \ + is_null[i] = true; \ + } else { \ + *(col_data + i) = i; \ + } \ + } \ + _vectorized_batch->set_size(size); \ + _vectorized_batch->set_selected_in_use(false); \ + pred->evaluate(_vectorized_batch); \ + ASSERT_EQ(_vectorized_batch->size(), 5); \ +} \ + +TEST_IN_LIST_PREDICATE(int8_t, TINYINT, OLAP_FIELD_TYPE_TINYINT) +TEST_IN_LIST_PREDICATE(int16_t, SMALLINT, OLAP_FIELD_TYPE_SMALLINT) +TEST_IN_LIST_PREDICATE(int32_t, INT, OLAP_FIELD_TYPE_INT) +TEST_IN_LIST_PREDICATE(int64_t, BIGINT, OLAP_FIELD_TYPE_BIGINT) +TEST_IN_LIST_PREDICATE(int128_t, LARGEINT, OLAP_FIELD_TYPE_LARGEINT) + +TEST_F(TestNullPredicate, FLOAT_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, 
std::string("FLOAT_COLUMN"), OLAP_FIELD_TYPE_FLOAT, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 10; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + float* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(float))); + col_vector->set_col_data(col_data); + for (int i = 0; i < size; ++i) { + *(col_data + i) = i + 0.1; + } + ColumnPredicate* pred = new NullPredicate(0, true); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 0); + + // for has nulls + col_vector->set_no_nulls(false); + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + for (int i = 0; i < size; ++i) { + if (i % 2 == 0) { + is_null[i] = true; + } else { + *(col_data + i) = i + 0.1; + } + } + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 5); +} + +TEST_F(TestNullPredicate, DOUBLE_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("DOUBLE_COLUMN"), OLAP_FIELD_TYPE_DOUBLE, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 10; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + double* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(double))); + col_vector->set_col_data(col_data); + for (int i = 0; i < size; ++i) { + *(col_data + i) = i + 0.1; + } + + ColumnPredicate* pred = new 
NullPredicate(0, true); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 0); + + // for has nulls + col_vector->set_no_nulls(false); + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + for (int i = 0; i < size; ++i) { + if (i % 2 == 0) { + is_null[i] = true; + } else { + *(col_data + i) = i + 0.1; + } + } + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 5); +} + +TEST_F(TestNullPredicate, DECIMAL_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("DECIMAL_COLUMN"), OLAP_FIELD_TYPE_DECIMAL, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 10; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + decimal12_t* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(decimal12_t))); + col_vector->set_col_data(col_data); + for (int i = 0; i < size; ++i) { + (*(col_data + i)).integer = i; + (*(col_data + i)).fraction = i; + } + + ColumnPredicate* pred = new NullPredicate(0, true); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 0); + + // for has nulls + col_vector->set_no_nulls(false); + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + for (int i = 0; i < size; ++i) { + if (i % 3 == 0) { + is_null[i] = true; + } else { + (*(col_data + i)).integer = i; + (*(col_data + i)).fraction = i; + } + } + + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + 
ASSERT_EQ(_vectorized_batch->size(), 4); +} + +TEST_F(TestNullPredicate, STRING_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("STRING_COLUMN"), OLAP_FIELD_TYPE_VARCHAR, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 10; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + StringValue* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(StringValue))); + col_vector->set_col_data(col_data); + + char* string_buffer = reinterpret_cast(_mem_pool->allocate(55)); + for (int i = 0; i < size; ++i) { + for (int j = 0; j <= i; ++j) { + string_buffer[j] = 'a' + i; + } + (*(col_data + i)).len = i + 1; + (*(col_data + i)).ptr = string_buffer; + string_buffer += i + 1; + } + + ColumnPredicate* pred = new NullPredicate(0, true); + ASSERT_EQ(_vectorized_batch->size(), 10); + + // for has nulls + col_vector->set_no_nulls(false); + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + string_buffer = reinterpret_cast(_mem_pool->allocate(55)); + for (int i = 0; i < size; ++i) { + if (i % 3 == 0) { + is_null[i] = true; + } else { + for (int j = 0; j <= i; ++j) { + string_buffer[j] = 'a' + i; + } + (*(col_data + i)).len = i + 1; + (*(col_data + i)).ptr = string_buffer; + } + string_buffer += i + 1; + } + + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 4); +} + +TEST_F(TestNullPredicate, DATE_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("DATE_COLUMN"), OLAP_FIELD_TYPE_DATE, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + 
schema.push_back(field_info); + int size = 6; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + uint24_t* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(uint24_t))); + col_vector->set_col_data(col_data); + + std::vector date_array; + date_array.push_back("2017-09-07"); + date_array.push_back("2017-09-08"); + date_array.push_back("2017-09-09"); + date_array.push_back("2017-09-10"); + date_array.push_back("2017-09-11"); + date_array.push_back("2017-09-12"); + for (int i = 0; i < size; ++i) { + uint24_t timestamp = datetime::to_date_timestamp(date_array[i].c_str()); + *(col_data + i) = timestamp; + } + + ColumnPredicate* pred = new NullPredicate(0, true); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 0); + + // for has nulls + col_vector->set_no_nulls(false); + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + for (int i = 0; i < size; ++i) { + if (i % 3 == 0) { + is_null[i] = true; + } else { + uint24_t timestamp = datetime::to_date_timestamp(date_array[i].c_str()); + *(col_data + i) = timestamp; + } + } + + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 2); +} + +TEST_F(TestNullPredicate, DATETIME_COLUMN) { + std::vector schema; + FieldInfo field_info; + SetFieldInfo(field_info, std::string("DATETIME_COLUMN"), OLAP_FIELD_TYPE_DATETIME, + OLAP_FIELD_AGGREGATION_REPLACE, 1, false, true); + schema.push_back(field_info); + int size = 6; + std::vector return_columns; + for (int i = 0; i < schema.size(); ++i) { + return_columns.push_back(i); + } + InitVectorizedBatch(schema, return_columns, size); + ColumnVector* 
col_vector = _vectorized_batch->column(0); + + // for no nulls + col_vector->set_no_nulls(true); + uint64_t* col_data = reinterpret_cast(_mem_pool->allocate(size * sizeof(uint64_t))); + col_vector->set_col_data(col_data); + + std::vector date_array; + date_array.push_back("2017-09-07 00:00:00"); + date_array.push_back("2017-09-08 00:01:00"); + date_array.push_back("2017-09-09 00:00:01"); + date_array.push_back("2017-09-10 01:00:00"); + date_array.push_back("2017-09-11 01:01:00"); + date_array.push_back("2017-09-12 01:01:01"); + for (int i = 0; i < size; ++i) { + uint64_t timestamp = datetime::to_datetime_timestamp(date_array[i].c_str()); + *(col_data + i) = timestamp; + } + + ColumnPredicate* pred = new NullPredicate(0, true); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 0); + + // for has nulls + col_vector->set_no_nulls(false); + bool* is_null = reinterpret_cast(_mem_pool->allocate(size)); + memset(is_null, 0, size); + col_vector->set_is_null(is_null); + for (int i = 0; i < size; ++i) { + if (i % 3 == 0) { + is_null[i] = true; + } else { + uint64_t timestamp = datetime::to_datetime_timestamp(date_array[i].c_str()); + *(col_data + i) = timestamp; + } + } + + _vectorized_batch->set_size(size); + _vectorized_batch->set_selected_in_use(false); + pred->evaluate(_vectorized_batch); + ASSERT_EQ(_vectorized_batch->size(), 2); +} + +} // namespace palo + +int main(int argc, char** argv) { + std::string conffile = std::string(getenv("PALO_HOME")) + "/conf/be.conf"; + if (!palo::config::init(conffile.c_str(), false)) { + fprintf(stderr, "error read config file. 
\n"); + return -1; + } + palo::init_glog("be-test"); + int ret = palo::OLAP_SUCCESS; + testing::InitGoogleTest(&argc, argv); + palo::CpuInfo::init(); + ret = RUN_ALL_TESTS(); + google::protobuf::ShutdownProtobufLibrary(); + return ret; +} diff --git a/be/test/olap/row_block_test.cpp b/be/test/olap/row_block_test.cpp index cccc3092bf..bf4e973143 100644 --- a/be/test/olap/row_block_test.cpp +++ b/be/test/olap/row_block_test.cpp @@ -31,702 +31,768 @@ using std::endl; using std::stringstream; namespace palo { -void set_field_info( - FieldInfo& info, - const char* name, - FieldType type, - FieldAggregationMethod agg, - uint32_t length, - bool is_key) { - info.name = string(name); - info.type = type; - info.aggregation = agg; - info.length = length; - info.is_key = is_key; -} - -static size_t g_row_length = 0; -static size_t g_row_length_mysql = 0; - -void make_error_schema_array(vector& schema_array) { - schema_array.clear(); - FieldInfo info; - set_field_info(info, "", OLAP_FIELD_TYPE_UNKNOWN, OLAP_FIELD_AGGREGATION_SUM, 0, true); - schema_array.push_back(info); -} - -// 生æˆä¸€ä»½FieldInfo array -void make_schema_array(vector& schema_array) { - schema_array.clear(); - FieldInfo info; - set_field_info(info, "", OLAP_FIELD_TYPE_INT, OLAP_FIELD_AGGREGATION_SUM, 4, true); - schema_array.push_back(info); - - set_field_info(info, "", OLAP_FIELD_TYPE_BIGINT, OLAP_FIELD_AGGREGATION_SUM, 8, true); - schema_array.push_back(info); - - set_field_info(info, "", OLAP_FIELD_TYPE_SMALLINT, OLAP_FIELD_AGGREGATION_SUM, 2, true); - schema_array.push_back(info); - - set_field_info(info, "", OLAP_FIELD_TYPE_UNSIGNED_INT, OLAP_FIELD_AGGREGATION_MAX, 4, true); - schema_array.push_back(info); - - set_field_info(info, "", OLAP_FIELD_TYPE_CHAR, OLAP_FIELD_AGGREGATION_REPLACE, 32, true); - schema_array.push_back(info); - - set_field_info(info, "", OLAP_FIELD_TYPE_FLOAT, OLAP_FIELD_AGGREGATION_SUM, 4, false); - schema_array.push_back(info); - - set_field_info(info, "", OLAP_FIELD_TYPE_CHAR, 
OLAP_FIELD_AGGREGATION_REPLACE, 64, false); - schema_array.push_back(info); - - set_field_info(info, "", OLAP_FIELD_TYPE_DATE, OLAP_FIELD_AGGREGATION_MAX, 4, false); - schema_array.push_back(info); - - set_field_info(info, "", OLAP_FIELD_TYPE_INT, OLAP_FIELD_AGGREGATION_SUM, 4, false); - schema_array.push_back(info); - - g_row_length = sizeof(int) + sizeof(int64_t) + sizeof(int16_t) + - sizeof(uint32_t) + 32 + sizeof(float) + 64 + sizeof(int) + sizeof(int); - - g_row_length_mysql = g_row_length - 1; -} - -// 生æˆä¸€ä»½short FieldInfo array -void make_schema_array_short(vector& schema_array) { - schema_array.clear(); - FieldInfo info; - set_field_info(info, "", OLAP_FIELD_TYPE_INT, OLAP_FIELD_AGGREGATION_SUM, 4, true); - schema_array.push_back(info); - - set_field_info(info, "", OLAP_FIELD_TYPE_BIGINT, OLAP_FIELD_AGGREGATION_SUM, 8, true); - schema_array.push_back(info); -} - -void make_string_array_short(vector& string_array) { - string_array.clear(); - string_array.push_back(string("123")); - string_array.push_back(string("456000")); -} - -// 按照上é¢çš„schema构造å„个column上的value -void make_string_array(vector& string_array) { - string_array.clear(); - string_array.push_back(string("123")); - string_array.push_back(string("456000")); - string_array.push_back(string("789")); - string_array.push_back(string("0")); - string_array.push_back(string("guping")); - string_array.push_back(string("123.0")); - string_array.push_back(string("olap")); - string_array.push_back(string("2012-5-8")); - string_array.push_back(string("100")); -} - -void make_string_array_0(vector& string_array) { - string_array.clear(); - string_array.push_back(string("123")); - string_array.push_back(string("456000")); - string_array.push_back(string("789")); - string_array.push_back(string("1")); - string_array.push_back(string("guping")); - string_array.push_back(string("123.0")); - string_array.push_back(string("olap")); - string_array.push_back(string("2012-5-8")); - 
string_array.push_back(string("100")); -} - -void make_string_array_1(vector& string_array) { - string_array.clear(); - string_array.push_back(string("123")); - string_array.push_back(string("456000")); - string_array.push_back(string("790")); - string_array.push_back(string("0")); - string_array.push_back(string("guping")); - string_array.push_back(string("123.0")); - string_array.push_back(string("olap")); - string_array.push_back(string("2012-5-8")); - string_array.push_back(string("100")); -} -void make_string_array_2(vector& string_array) { - string_array.clear(); - string_array.push_back(string("123")); - string_array.push_back(string("456001")); - string_array.push_back(string("789")); - string_array.push_back(string("0")); - string_array.push_back(string("guping")); - string_array.push_back(string("123.0")); - string_array.push_back(string("olap")); - string_array.push_back(string("2012-5-8")); - string_array.push_back(string("100")); -} -void make_string_array_3(vector& string_array) { - string_array.clear(); - string_array.push_back(string("124")); - string_array.push_back(string("456000")); - string_array.push_back(string("789")); - string_array.push_back(string("0")); - string_array.push_back(string("guping")); - string_array.push_back(string("123.0")); - string_array.push_back(string("olap")); - string_array.push_back(string("2012-5-8")); - string_array.push_back(string("100")); -} - -// 测试æˆå‘˜å‡½æ•°åœ¨åˆå§‹åŒ–ã€å‚数异常æ¡ä»¶ä¸‹çš„行为 -class TestRowBlockSimple : public testing::Test { -protected: - void SetUp() { - make_schema_array(_tablet_schema); - _block_info.checksum = 12345678; - _block_info.row_num = 8; - _block_info.unpacked_len = 0; - } - void TearDown() { - } - vector _tablet_schema; - RowBlockInfo _block_info; -}; - -// 测试æˆå‘˜init方法 -TEST_F(TestRowBlockSimple, test_init) { - RowBlock row_block(_tablet_schema); - ASSERT_EQ(OLAP_SUCCESS, row_block.init(_block_info)); - ASSERT_EQ(row_block._buf_len, g_row_length * 8); - 
ASSERT_EQ(OLAP_ERR_INIT_FAILED, row_block.init(_block_info)); - vector temp_schema = row_block.tablet_schema(); - ASSERT_EQ(temp_schema.size(), _tablet_schema.size()); -} - -// æµ‹è¯•å„æˆå‘˜å‡½æ•°åœ¨æœªåˆå§‹åŒ–状æ€ä¸‹çš„行为 -TEST_F(TestRowBlockSimple, test_not_init) { - RowBlock row_block(_tablet_schema); - ASSERT_EQ(OLAP_ERR_NOT_INITED, row_block.get_row_to_read(0, NULL)); - RowCursor cursor; - ASSERT_EQ(OLAP_ERR_NOT_INITED, row_block.find_row(cursor, true, NULL)); - ASSERT_EQ(OLAP_ERR_NOT_INITED, row_block.decompress(NULL, 0, OLAP_COMP_STORAGE)); - ASSERT_EQ(OLAP_ERR_NOT_INITED, row_block.set_row(0, cursor)); - ASSERT_EQ(OLAP_ERR_NOT_INITED, row_block.finalize(0)); -} - -// æµ‹è¯•å„æˆå‘˜å‡½æ•°åœ¨è¾“入傿•°é”™è¯¯æƒ…况下的行为 -TEST_F(TestRowBlockSimple, test_err_param) { - RowBlock row_block(_tablet_schema); - ASSERT_EQ(OLAP_SUCCESS, row_block.init(_block_info)); - RowCursor cursor; - - ASSERT_EQ(OLAP_ERR_INPUT_PARAMETER_ERROR, row_block.get_row_to_read(100, NULL)); - ASSERT_EQ(OLAP_ERR_INPUT_PARAMETER_ERROR, row_block.get_row_to_read(1, NULL)); - ASSERT_EQ(OLAP_ERR_INPUT_PARAMETER_ERROR, row_block.get_row_to_read(1, &cursor)); - - ASSERT_EQ(OLAP_ERR_INPUT_PARAMETER_ERROR, - row_block.find_row(cursor, false, NULL)); - - ASSERT_EQ(OLAP_ERR_INPUT_PARAMETER_ERROR, row_block.compress( - NULL, 0, NULL, OLAP_COMP_STORAGE)); - - ASSERT_EQ(OLAP_ERR_INPUT_PARAMETER_ERROR, row_block.decompress( - NULL, 0, OLAP_COMP_STORAGE)); - - ASSERT_EQ(OLAP_ERR_INPUT_PARAMETER_ERROR, row_block.set_row(100, cursor)); - ASSERT_EQ(OLAP_ERR_INPUT_PARAMETER_ERROR, row_block.set_row(1, cursor)); - - ASSERT_EQ(OLAP_ERR_INPUT_PARAMETER_ERROR, row_block.finalize(100)); -} - -TEST_F(TestRowBlockSimple, test_invalid_schema) { - vector err_schema; - make_error_schema_array(err_schema); - RowBlock err_block(err_schema); - ASSERT_EQ(OLAP_SUCCESS, err_block.init(_block_info)); - RowCursor cursor; - uint32_t row_index; - ASSERT_EQ(OLAP_ERR_INIT_FAILED, err_block.find_row(cursor, false, &row_index)); -} class 
TestRowBlock : public testing::Test { public: - TestRowBlock() : test_index(0) {} + TestRowBlock() {} void SetUp() { - //LOG(INFO) << "test_index is " << test_index++ << endl; - make_schema_array(_tablet_schema); - _block_info.checksum = 12345678; - _block_info.row_num = 8; - for (uint32_t i = 0; i < _tablet_schema.size(); i++) { - _block_info.unpacked_len += _tablet_schema[i].length; - } - _block_info.unpacked_len *= _block_info.row_num; - ASSERT_EQ(OLAP_SUCCESS, cursor.init(_tablet_schema)); - ASSERT_EQ(OLAP_SUCCESS, other_cursor.init(_tablet_schema)); - make_schema_array_short(_tablet_schema_short); - make_string_array_short(string_array_short); - ASSERT_EQ(OLAP_SUCCESS, short_cursor.init(_tablet_schema_short)); - ASSERT_EQ(OLAP_SUCCESS, short_cursor.from_string(string_array_short)); } void TearDown() { } - void SetRows(RowBlock& row_block) { - make_string_array_0(string_array); - ASSERT_EQ(OLAP_SUCCESS, other_cursor.from_string(string_array)); - ASSERT_EQ(OLAP_SUCCESS, row_block.set_row(0, other_cursor)); - - make_string_array_1(string_array); - ASSERT_EQ(OLAP_SUCCESS, other_cursor.from_string(string_array)); - ASSERT_EQ(OLAP_SUCCESS, row_block.set_row(1, other_cursor)); - - make_string_array_2(string_array); - ASSERT_EQ(OLAP_SUCCESS, other_cursor.from_string(string_array)); - ASSERT_EQ(OLAP_SUCCESS, row_block.set_row(2, other_cursor)); - - make_string_array_3(string_array); - ASSERT_EQ(OLAP_SUCCESS, other_cursor.from_string(string_array)); - ASSERT_EQ(OLAP_SUCCESS, row_block.set_row(3, other_cursor)); } - - RowCursor cursor; - RowCursor other_cursor; - RowCursor short_cursor; - size_t test_index; - RowBlockInfo _block_info; - vector _tablet_schema; - vector string_array; - vector _tablet_schema_short; - vector string_array_short; }; -TEST_F(TestRowBlock, test_set_row) { - RowBlock row_block(_tablet_schema); - ASSERT_EQ(OLAP_SUCCESS, row_block.init(_block_info)); - ASSERT_EQ(row_block._buf_len, g_row_length * 8); - SetRows(row_block); - - // 
å°è¯•用get_rowå–出一行æ¥è¿›è¡Œæ¯”较 - make_string_array_0(string_array); - ASSERT_EQ(OLAP_SUCCESS, cursor.from_string(string_array)); - ASSERT_EQ(OLAP_SUCCESS, row_block.get_row_to_read(0, &other_cursor)); - ASSERT_EQ(0, cursor.cmp(other_cursor)); - ASSERT_EQ(OLAP_SUCCESS, row_block.get_row_to_read(1, &other_cursor)); - ASSERT_EQ(-1, cursor.cmp(other_cursor)); - - // 测试finalize - ASSERT_EQ(OLAP_SUCCESS, row_block.finalize(4)); - ASSERT_EQ(size_t(4), row_block._info.row_num); - ASSERT_EQ(size_t(4 * g_row_length), row_block._used_buf_size); - - ASSERT_EQ(OLAP_SUCCESS, row_block.get_row_to_read(0, &other_cursor)); - ASSERT_EQ(0, cursor.cmp(other_cursor)); - - // 冿¬¡finalize,这次应该对存储内容没有改动 - ASSERT_EQ(OLAP_SUCCESS, row_block.finalize(4)); - ASSERT_EQ(size_t(4), row_block._info.row_num); - ASSERT_EQ(size_t(4 * g_row_length), row_block._used_buf_size); - - ASSERT_EQ(OLAP_SUCCESS, row_block.get_row_to_read(1, &other_cursor)); - ASSERT_EQ(-1, cursor.cmp(other_cursor)); - - ASSERT_EQ(OLAP_SUCCESS, row_block.get_row_to_read(0, &other_cursor)); - ASSERT_EQ(0, cursor.cmp(other_cursor)); - - // 测试find_row - uint32_t index = 0; - // 使用全key进行测试 - ASSERT_EQ(OLAP_SUCCESS, row_block.get_row_to_read(0, &other_cursor)); - ASSERT_EQ(OLAP_SUCCESS, row_block.find_row(other_cursor, false, &index)); - ASSERT_EQ(uint32_t(0), index); - ASSERT_EQ(OLAP_SUCCESS, row_block.find_row(other_cursor, true, &index)); - ASSERT_EQ(uint32_t(1), index); - - ASSERT_EQ(OLAP_SUCCESS, row_block.get_row_to_read(3, &other_cursor)); - ASSERT_EQ(OLAP_SUCCESS, row_block.find_row(other_cursor, false, &index)); - ASSERT_EQ(uint32_t(3), index); - - ASSERT_EQ(OLAP_SUCCESS, row_block.find_row(other_cursor, true, &index)); - ASSERT_EQ(uint32_t(4), index); - // 构造一个短key进行测试 - ASSERT_EQ(OLAP_SUCCESS, row_block.find_row(short_cursor, false, &index)); - ASSERT_EQ(uint32_t(0), index); - - ASSERT_EQ(OLAP_SUCCESS, row_block.find_row(short_cursor, true, &index)); - ASSERT_EQ(uint32_t(2), index); - - // 测试clear函数 - 
ASSERT_EQ(OLAP_SUCCESS, row_block.clear()); - ASSERT_EQ((size_t)8, row_block._info.row_num); - ASSERT_EQ(size_t(8 * g_row_length), row_block._buf_len); +TEST_F(TestRowBlock, init) { + std::vector fields; + { + // k1: bigint + { + FieldInfo info; + info.name = "k1"; + info.type = OLAP_FIELD_TYPE_BIGINT; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 8; + info.is_key = true; + fields.push_back(info); + } + // k2: char + { + FieldInfo info; + info.name = "k2"; + info.type = OLAP_FIELD_TYPE_CHAR; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 10; + info.is_key = true; + fields.push_back(info); + } + // k3: varchar + { + FieldInfo info; + info.name = "k3"; + info.type = OLAP_FIELD_TYPE_VARCHAR; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 20; + info.is_key = true; + fields.push_back(info); + } + } + { + // has nullbyte + RowBlock block(fields); + RowBlockInfo block_info; + block_info.row_num = 1024; + block_info.data_file_type = OLAP_DATA_FILE; + block_info.null_supported = true; + auto res = block.init(block_info); + ASSERT_EQ(OLAP_SUCCESS, res); + // num_rows * (num_nullbytes + bigint + char + varchar) + ASSERT_EQ(1024 * (3 + 8 + 10 + (4 + 20)), block.buf_len()); + } + { + // has nullbyte + RowBlock block(fields); + RowBlockInfo block_info; + block_info.row_num = 1024; + block_info.data_file_type = OLAP_DATA_FILE; + block_info.null_supported = false; + auto res = block.init(block_info); + ASSERT_EQ(OLAP_SUCCESS, res); + // num_rows * (num_nullbytes + bigint + char + varchar) + ASSERT_EQ(1024 * (8 + 10 + (4 + 20)), block.buf_len()); + } } -TEST_F(TestRowBlock, test_compress_decompress) { - RowBlock row_block(_tablet_schema); - _block_info.row_num = 4; - _block_info.unpacked_len = 0; - ASSERT_EQ(OLAP_SUCCESS, row_block.init(_block_info)); - ASSERT_EQ(row_block._buf_len, g_row_length * 4); - LOG(INFO) << "before compression, size is " << g_row_length * 4; - SetRows(row_block); - LOG(INFO) << "row block adler32 is 
" << row_block._info.checksum << endl; - // 必须先finalizeå†compress - ASSERT_EQ(OLAP_SUCCESS, row_block.finalize(4)); - LOG(INFO) << "row block adler32 is " << row_block._info.checksum << endl; - const size_t buf_len = 100 * 1024; - char* buf = new char[buf_len]; +TEST_F(TestRowBlock, write_and_read) { + std::vector fields; + { + // k1: bigint + { + FieldInfo info; + info.name = "k1"; + info.type = OLAP_FIELD_TYPE_BIGINT; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 8; + info.is_key = true; + fields.push_back(info); + } + // k2: char + { + FieldInfo info; + info.name = "k2"; + info.type = OLAP_FIELD_TYPE_CHAR; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 10; + info.is_key = true; + fields.push_back(info); + } + // k3: varchar + { + FieldInfo info; + info.name = "k3"; + info.type = OLAP_FIELD_TYPE_VARCHAR; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 20; + info.is_key = true; + fields.push_back(info); + } + } + // + RowBlock block(fields); + RowBlockInfo block_info; + block_info.row_num = 1024; + block_info.data_file_type = OLAP_DATA_FILE; + block_info.null_supported = true; + auto res = block.init(block_info); + ASSERT_EQ(OLAP_SUCCESS, res); + + RowCursor row; + row.init(fields); + for (int i = 0; i < 5; ++i) { + block.get_row(i, &row); + + // bigint + { + int64_t val = i; + row.set_not_null(0); + row.set_field_content(0, (const char*)&val, block.mem_pool()); + } + // char + { + char buf[10]; + memset(buf, 'a' + i, 10); + StringSlice val(buf, 10); + row.set_not_null(1); + row.set_field_content(1, (const char*)&val, block.mem_pool()); + } + // varchar + { + char buf[10]; + memset(buf, '0' + i, 10); + StringSlice val(buf, 10); + row.set_not_null(2); + row.set_field_content(2, (const char*)&val, block.mem_pool()); + } + } + block.finalize(5); + ASSERT_EQ(5, block.row_num()); + + char serialized_buf[2048]; size_t written_len = 0; - ASSERT_EQ(OLAP_SUCCESS, row_block.compress(buf, buf_len, &written_len, - 
OLAP_COMP_STORAGE)); - LOG(INFO) << "after compression, size is " << written_len << endl; + res = block.serialize_to_row_format(serialized_buf, 2048, &written_len, OLAP_COMP_STORAGE); + ASSERT_EQ(OLAP_SUCCESS, res); - uint32_t checksum = row_block._info.checksum; - LOG(INFO) << "row block adler32 is " << row_block._info.checksum << endl; + { + RowBlock resolve_block(fields); + block_info.checksum = block.row_block_info().checksum; + block_info.row_num = 5; + res = resolve_block.init(block_info); + ASSERT_EQ(OLAP_SUCCESS, res); - // 新建一个rowblock,然åŽä»Žåˆšæ‰åŽ‹å‡ºæ¥çš„æ•°æ®è§£åŽ‹èµ·æ¥ - RowBlock other_row_block(_tablet_schema); - _block_info.checksum = checksum; - _block_info.row_num = 4; - ASSERT_EQ(OLAP_SUCCESS, other_row_block.init(_block_info)); - ASSERT_EQ(other_row_block._buf_len, g_row_length * 4); + res = resolve_block.decompress(serialized_buf, written_len, OLAP_COMP_STORAGE); + ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(OLAP_SUCCESS, other_row_block.decompress(buf, - written_len, OLAP_COMP_STORAGE)); - // 在两个rowblocké‡Œé¢æŒ‰è¡Œæ¯”å¯¹æ•°æ® - for (size_t i = 0; i < 4; ++i) { - ASSERT_EQ(OLAP_SUCCESS, row_block.get_row_to_read(i, &cursor)); - ASSERT_EQ(OLAP_SUCCESS, other_row_block.get_row_to_read(i, &other_cursor)); - ASSERT_EQ(0, cursor.cmp(other_cursor)); - } - - // å†å»ºä¸€ä¸ªrow_block,从错误的buf中decompressæ•°æ®ï¼Œæµ‹è¯•异常场景, - // 分为三ç§å¼‚常场景:1. 待解压buf䏿­£ç¡® 2. 内部buflenä¸å¤Ÿå¤§ 3. 
checksum䏿­£ç¡® - buf[0]++; - ASSERT_EQ(OLAP_ERR_DECOMPRESS_ERROR, other_row_block.decompress(buf, - written_len, OLAP_COMP_STORAGE)); - buf[0]--; - - // 由于å˜é•¿å­—ç¬¦ä¸²çš„ç¼˜æ•…ï¼Œè¿™é‡Œæ²¡æ³•å†æ£€æµ‹æŽ¥å‡ºæ¥å­—符串ä¸å¤Ÿçš„问题 - // é checksumæ¥æ ¡éªŒå¥½äº†, 这里测下内部bufä¸è¶³çš„æƒ…况 - other_row_block._buf_len--; - ASSERT_EQ(OLAP_ERR_DECOMPRESS_ERROR, other_row_block.decompress(buf, - written_len, OLAP_COMP_STORAGE)); - other_row_block._buf_len++; - - other_row_block._info.checksum++; - ASSERT_EQ(OLAP_ERR_CHECKSUM_ERROR, other_row_block.decompress(buf, - written_len, OLAP_COMP_STORAGE)); - other_row_block._info.checksum--; - - delete[] buf; -} - -TEST_F(TestRowBlock, test_crc) { - unsigned int crc32 = 0xffffffff; - const char* buf1 = "abcdefg"; - size_t len1 = strlen(buf1); - unsigned int v1 = crc32c_lut(buf1, 0, len1, crc32); - LOG(INFO) << "crc32c_lut:" << v1 << endl; - ASSERT_EQ(v1, 433589182); - if (1 == check_sse4_2()) { - unsigned int v2 = baidu_crc32_qw(buf1, crc32, len1); - LOG(INFO) << "baidu_crc32_qw" << v2 <size); + ASSERT_EQ(0, memcmp(buf, slice->data, 10)); + } + { + ASSERT_FALSE(row.is_null(2)); + StringSlice* slice = (StringSlice*)row.get_field_content_ptr(2); + char buf[20]; + memset(buf, '0' + i, 10); + ASSERT_EQ(10, slice->size); + ASSERT_EQ(0, memcmp(buf, slice->data, 10)); + } + } } } -class TestRowBlockWithConjuncts : public testing::Test { -public: - TestRowBlockWithConjuncts() : - _object_pool(NULL), _runtime_state(NULL), _row_desc(NULL) { +TEST_F(TestRowBlock, write_and_read_without_nullbyte) { + std::vector fields; + { + // k1: bigint + { + FieldInfo info; + info.name = "k1"; + info.type = OLAP_FIELD_TYPE_BIGINT; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 8; + info.is_key = true; + fields.push_back(info); + } + // k2: char + { + FieldInfo info; + info.name = "k2"; + info.type = OLAP_FIELD_TYPE_CHAR; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 10; + info.is_key = true; + fields.push_back(info); + } + // 
k3: varchar + { + FieldInfo info; + info.name = "k3"; + info.type = OLAP_FIELD_TYPE_VARCHAR; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 20; + info.is_key = true; + fields.push_back(info); + } } + // + RowBlock block(fields); + RowBlockInfo block_info; + block_info.row_num = 1024; + block_info.data_file_type = OLAP_DATA_FILE; + block_info.null_supported = false; + auto res = block.init(block_info); + ASSERT_EQ(OLAP_SUCCESS, res); + + RowCursor row; + row.init(fields); + for (int i = 0; i < 5; ++i) { + block.get_row(i, &row); + + // bigint + { + int64_t val = i; + row.set_not_null(0); + row.set_field_content(0, (const char*)&val, block.mem_pool()); + } + // char + { + char buf[10]; + memset(buf, 'a' + i, 10); + StringSlice val(buf, 10); + row.set_not_null(1); + row.set_field_content(1, (const char*)&val, block.mem_pool()); + } + // varchar + { + char buf[10]; + memset(buf, '0' + i, 10); + StringSlice val(buf, 10); + row.set_not_null(2); + row.set_field_content(2, (const char*)&val, block.mem_pool()); + } + } + block.finalize(5); + ASSERT_EQ(5, block.row_num()); + + char serialized_buf[2048]; + size_t written_len = 0; + res = block.serialize_to_row_format(serialized_buf, 2048, &written_len, OLAP_COMP_STORAGE); + ASSERT_EQ(OLAP_SUCCESS, res); + + { + RowBlock resolve_block(fields); + block_info.checksum = block.row_block_info().checksum; + block_info.row_num = 5; + res = resolve_block.init(block_info); + ASSERT_EQ(OLAP_SUCCESS, res); + + res = resolve_block.decompress(serialized_buf, written_len, OLAP_COMP_STORAGE); + ASSERT_EQ(OLAP_SUCCESS, res); + + ASSERT_EQ(5, resolve_block.row_num()); + for (int i = 0; i < 5; ++i) { + resolve_block.get_row(i, &row); + { + ASSERT_FALSE(row.is_null(0)); + ASSERT_EQ(i, *(int64_t*)row.get_field_content_ptr(0)); + } + { + ASSERT_FALSE(row.is_null(1)); + StringSlice* slice = (StringSlice*)row.get_field_content_ptr(1); + char buf[10]; + memset(buf, 'a' + i, 10); + ASSERT_EQ(10, slice->size); + ASSERT_EQ(0, 
memcmp(buf, slice->data, 10)); + } + { + ASSERT_FALSE(row.is_null(2)); + StringSlice* slice = (StringSlice*)row.get_field_content_ptr(2); + char buf[20]; + memset(buf, '0' + i, 10); + ASSERT_EQ(10, slice->size); + ASSERT_EQ(0, memcmp(buf, slice->data, 10)); + } + } + } +} + +TEST_F(TestRowBlock, compress_failed) { + std::vector fields; + { + // k1: bigint + { + FieldInfo info; + info.name = "k1"; + info.type = OLAP_FIELD_TYPE_BIGINT; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 8; + info.is_key = true; + fields.push_back(info); + } + // k2: char + { + FieldInfo info; + info.name = "k2"; + info.type = OLAP_FIELD_TYPE_CHAR; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 10; + info.is_key = true; + fields.push_back(info); + } + // k3: varchar + { + FieldInfo info; + info.name = "k3"; + info.type = OLAP_FIELD_TYPE_VARCHAR; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 20; + info.is_key = true; + fields.push_back(info); + } + } + // + RowBlock block(fields); + RowBlockInfo block_info; + block_info.row_num = 1024; + block_info.data_file_type = OLAP_DATA_FILE; + block_info.null_supported = true; + auto res = block.init(block_info); + ASSERT_EQ(OLAP_SUCCESS, res); + + RowCursor row; + row.init(fields); + for (int i = 0; i < 5; ++i) { + block.get_row(i, &row); + + // bigint + { + int64_t val = i; + row.set_field_content(0, (const char*)&val, block.mem_pool()); + } + // char + { + char buf[10]; + memset(buf, 'a' + i, 10); + StringSlice val(buf, 10); + row.set_field_content(1, (const char*)&val, block.mem_pool()); + } + // varchar + { + char buf[10]; + memset(buf, '0' + i, 10); + StringSlice val(buf, 10); + row.set_field_content(2, (const char*)&val, block.mem_pool()); + } + } + block.finalize(5); + ASSERT_EQ(5, block.row_num()); + + char serialized_buf[2048]; + size_t written_len = 0; + res = block.serialize_to_row_format(serialized_buf, 1, &written_len, OLAP_COMP_STORAGE); + ASSERT_NE(OLAP_SUCCESS, res); +} + 
+TEST_F(TestRowBlock, decompress_failed) { + std::vector fields; + { + // k1: bigint + { + FieldInfo info; + info.name = "k1"; + info.type = OLAP_FIELD_TYPE_BIGINT; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 8; + info.is_key = true; + fields.push_back(info); + } + // k2: char + { + FieldInfo info; + info.name = "k2"; + info.type = OLAP_FIELD_TYPE_CHAR; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 10; + info.is_key = true; + fields.push_back(info); + } + // k3: varchar + { + FieldInfo info; + info.name = "k3"; + info.type = OLAP_FIELD_TYPE_VARCHAR; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 20; + info.is_key = true; + fields.push_back(info); + } + } + // + RowBlock block(fields); + RowBlockInfo block_info; + block_info.row_num = 1024; + block_info.data_file_type = OLAP_DATA_FILE; + block_info.null_supported = true; + auto res = block.init(block_info); + ASSERT_EQ(OLAP_SUCCESS, res); + + RowCursor row; + row.init(fields); + for (int i = 0; i < 5; ++i) { + block.get_row(i, &row); + + // bigint + { + int64_t val = i; + row.set_field_content(0, (const char*)&val, block.mem_pool()); + } + // char + { + char buf[10]; + memset(buf, 'a' + i, 10); + StringSlice val(buf, 10); + row.set_field_content(1, (const char*)&val, block.mem_pool()); + } + // varchar + { + char buf[10]; + memset(buf, '0' + i, 10); + StringSlice val(buf, 10); + row.set_field_content(2, (const char*)&val, block.mem_pool()); + } + } + block.finalize(5); + ASSERT_EQ(5, block.row_num()); + + char serialized_buf[2048]; + size_t written_len = 0; + res = block.serialize_to_row_format(serialized_buf, 2048, &written_len, OLAP_COMP_STORAGE); + ASSERT_EQ(OLAP_SUCCESS, res); + + { + // checksum failed + RowBlock resolve_block(fields); + block_info.checksum = 0; + block_info.row_num = 5; + res = resolve_block.init(block_info); + ASSERT_EQ(OLAP_SUCCESS, res); + + res = resolve_block.decompress(serialized_buf, written_len, OLAP_COMP_STORAGE); + 
ASSERT_NE(OLAP_SUCCESS, res); + } + { + // buffer is not ok + RowBlock resolve_block(fields); + block_info.checksum = block.row_block_info().checksum; + block_info.row_num = 5; + res = resolve_block.init(block_info); + ASSERT_EQ(OLAP_SUCCESS, res); + + res = resolve_block.decompress(serialized_buf, written_len - 1, OLAP_COMP_STORAGE); + ASSERT_NE(OLAP_SUCCESS, res); + } +} + +TEST_F(TestRowBlock, find_row) { + std::vector fields; + { + // k1: bigint + { + FieldInfo info; + info.name = "k1"; + info.type = OLAP_FIELD_TYPE_BIGINT; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 8; + info.is_key = true; + fields.push_back(info); + } + // k2: char + { + FieldInfo info; + info.name = "k2"; + info.type = OLAP_FIELD_TYPE_CHAR; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 10; + info.is_key = true; + fields.push_back(info); + } + // k3: varchar + { + FieldInfo info; + info.name = "k3"; + info.type = OLAP_FIELD_TYPE_VARCHAR; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 20; + info.is_key = true; + fields.push_back(info); + } + } + // + RowBlock block(fields); + RowBlockInfo block_info; + block_info.row_num = 1024; + block_info.data_file_type = OLAP_DATA_FILE; + block_info.null_supported = true; + auto res = block.init(block_info); + ASSERT_EQ(OLAP_SUCCESS, res); + + RowCursor row; + row.init(fields); + for (int i = 0; i < 5; ++i) { + block.get_row(i, &row); + + // bigint + { + int64_t val = i; + row.set_not_null(0); + row.set_field_content(0, (const char*)&val, block.mem_pool()); + } + // char + { + char buf[10]; + memset(buf, 'a' + i, 10); + StringSlice val(buf, 10); + row.set_not_null(1); + row.set_field_content(1, (const char*)&val, block.mem_pool()); + } + // varchar + { + char buf[10]; + memset(buf, '0' + i, 10); + StringSlice val(buf, 10); + row.set_not_null(2); + row.set_field_content(2, (const char*)&val, block.mem_pool()); + } + } + block.finalize(5); + ASSERT_EQ(5, block.row_num()); - void SetUp() { - 
_object_pool = new ObjectPool(); - _runtime_state = _object_pool->add(new RuntimeState("")); + { + RowCursor find_row; + find_row.init(fields); + for (int i = 0; i < 5; ++i) { + // bigint + { + int64_t val = i; + find_row.set_not_null(0); + find_row.set_field_content(0, (const char*)&val, block.mem_pool()); + } + // char + { + char buf[10]; + memset(buf, 'a' + i, 10); + StringSlice val(buf, 10); + find_row.set_not_null(1); + find_row.set_field_content(1, (const char*)&val, block.mem_pool()); + } + // varchar + { + char buf[10]; + memset(buf, '0' + i, 10); + StringSlice val(buf, 10); + find_row.set_not_null(2); + find_row.set_field_content(2, (const char*)&val, block.mem_pool()); + } + uint32_t row_index; + res = block.find_row(find_row, false, &row_index); + ASSERT_EQ(OLAP_SUCCESS, res); + ASSERT_EQ(i, row_index); - TDescriptorTable ttbl; - TTupleDescriptor tuple_desc; - tuple_desc.__set_id(0); - tuple_desc.__set_byteSize(8); - tuple_desc.__set_numNullBytes(0); - ttbl.tupleDescriptors.push_back(tuple_desc); - - { - TSlotDescriptor slot_desc; - - slot_desc.__set_id(0); - slot_desc.__set_parent(0); - // slot_desc.__set_slotType(TPrimitiveType::INT); - slot_desc.__set_slotType(gen_type_desc(TPrimitiveType::INT)); - slot_desc.__set_columnPos(0); - slot_desc.__set_byteOffset(4); - slot_desc.__set_nullIndicatorByte(0); - slot_desc.__set_nullIndicatorBit(0); - slot_desc.__set_colName("col1"); - slot_desc.__set_slotIdx(0); - slot_desc.__set_isMaterialized(true); - ttbl.slotDescriptors.push_back(slot_desc); - } - - { - TSlotDescriptor slot_desc; - slot_desc.__set_id(1); - slot_desc.__set_parent(0); - slot_desc.__set_slotType(gen_type_desc(TPrimitiveType::BIGINT)); - slot_desc.__set_columnPos(1); - slot_desc.__set_byteOffset(8); - slot_desc.__set_nullIndicatorByte(0); - slot_desc.__set_nullIndicatorBit(0); - slot_desc.__set_colName("col2"); - slot_desc.__set_slotIdx(1); - slot_desc.__set_isMaterialized(true); - ttbl.slotDescriptors.push_back(slot_desc); - } - - 
DescriptorTbl* desc_tbl = NULL; - ASSERT_TRUE(DescriptorTbl::create(_object_pool, ttbl, &desc_tbl).ok()); - ASSERT_TRUE(desc_tbl != NULL); - _runtime_state->set_desc_tbl(desc_tbl); - - std::vector row_tuples; - row_tuples.push_back(0); - std::vector nullable_tuples; - nullable_tuples.push_back(false); - _row_desc = _object_pool->add( - new RowDescriptor(*desc_tbl, row_tuples, nullable_tuples)); - - make_schema_array(_tablet_schema); - _block_info.checksum = 12345678; - _block_info.row_num = 128; - for (uint32_t i = 0; i < _tablet_schema.size(); i++) { - _block_info.unpacked_len += _tablet_schema[i].length; - } - _block_info.unpacked_len *= _block_info.row_num; - ASSERT_EQ(OLAP_SUCCESS, cursor.init(_tablet_schema)); - ASSERT_EQ(OLAP_SUCCESS, other_cursor.init(_tablet_schema)); - } - - void TearDown() { - if (_object_pool != NULL) { - delete _object_pool; - _object_pool = NULL; - } - } - - Expr* create_expr(int value) { - TExpr exprs; - { - TExprNode expr_node; - expr_node.__set_node_type(TExprNodeType::BINARY_PRED); - // TColumnType type; - // type.__set_type(TPrimitiveType::INT); - expr_node.__set_type(gen_type_desc(TPrimitiveType::INT)); - expr_node.__set_num_children(2); - expr_node.__isset.opcode = true; - // expr_node.__set_opcode(TExprOpcode::LT_INT_INT); - expr_node.__isset.vector_opcode = true; - // expr_node.__set_vector_opcode( - // TExprOpcode::FILTER_LT_INT_INT); - exprs.nodes.push_back(expr_node); + res = block.find_row(find_row, true, &row_index); + ASSERT_EQ(OLAP_SUCCESS, res); + ASSERT_EQ(i + 1, row_index); } { - TExprNode expr_node; - expr_node.__set_node_type(TExprNodeType::SLOT_REF); - // TColumnType type; - // type.__set_type(TPrimitiveType::INT); - expr_node.__set_type(gen_type_desc(TPrimitiveType::INT)); - expr_node.__set_num_children(0); - expr_node.__isset.slot_ref = true; - TSlotRef slot_ref; - slot_ref.__set_slot_id(0); - slot_ref.__set_tuple_id(0); - expr_node.__set_slot_ref(slot_ref); - expr_node.__isset.output_column = true; - 
expr_node.__set_output_column(0); - exprs.nodes.push_back(expr_node); + // bigint + { + int64_t val = 1; + find_row.set_field_content(0, (const char*)&val, block.mem_pool()); + } + // char + { + char buf[10]; + memset(buf, 'c', 9); + StringSlice val(buf, 9); + find_row.set_field_content(1, (const char*)&val, block.mem_pool()); + } + // varchar + { + char buf[10]; + memset(buf, '0', 10); + StringSlice val(buf, 10); + find_row.set_field_content(2, (const char*)&val, block.mem_pool()); + } + uint32_t row_index; + res = block.find_row(find_row, true, &row_index); + ASSERT_EQ(OLAP_SUCCESS, res); + ASSERT_EQ(2, row_index); } - { - TExprNode expr_node; - expr_node.__set_node_type(TExprNodeType::INT_LITERAL); - //TColumnType type; - //type.__set_type(TPrimitiveType::INT); - expr_node.__set_type(gen_type_desc(TPrimitiveType::INT)); - expr_node.__set_num_children(0); - expr_node.__isset.int_literal = true; - TIntLiteral int_literal; - int_literal.__set_value(value); - expr_node.__set_int_literal(int_literal); - exprs.nodes.push_back(expr_node); + // bigint + { + int64_t val = -1; + find_row.set_field_content(0, (const char*)&val, block.mem_pool()); + } + // char + { + char buf[10]; + memset(buf, 'c', 9); + StringSlice val(buf, 9); + find_row.set_field_content(1, (const char*)&val, block.mem_pool()); + } + // varchar + { + char buf[10]; + memset(buf, '0', 10); + StringSlice val(buf, 10); + find_row.set_field_content(2, (const char*)&val, block.mem_pool()); + } + uint32_t row_index; + res = block.find_row(find_row, true, &row_index); + ASSERT_EQ(OLAP_SUCCESS, res); + ASSERT_EQ(0, row_index); } - - //Expr* root_expr = NULL; - ExprContext* expr_content = NULL; - if (Expr::create_expr_tree(_object_pool, exprs, &expr_content).ok()) { - return expr_content->root(); - } else { - return NULL; - } - } - - void make_schema_array(vector& schema_array) { - schema_array.clear(); - FieldInfo info; - set_field_info(info, "", OLAP_FIELD_TYPE_INT, OLAP_FIELD_AGGREGATION_NONE, 4, true); - 
schema_array.push_back(info); - set_field_info(info, "", OLAP_FIELD_TYPE_BIGINT, OLAP_FIELD_AGGREGATION_SUM, 8, false); - schema_array.push_back(info); - - g_row_length = sizeof(int) + sizeof(int64_t); - g_row_length_mysql = g_row_length - 1; - } - - void set_rows(RowBlock& row_block) { - vector string_array; - for (int i = 0; i < _block_info.row_num; ++i) { - make_row(string_array, i); - ASSERT_EQ(OLAP_SUCCESS, other_cursor.from_string(string_array)); - ASSERT_EQ(OLAP_SUCCESS, row_block.set_row(i, other_cursor)); - } - } - - void make_row(vector& string_array, int value) { - string_array.clear(); - stringstream s; - s << value; - string_array.push_back(s.str()); - string_array.push_back(s.str()); - } - - RowCursor cursor; - RowCursor other_cursor; - RowCursor short_cursor; - size_t test_index; - RowBlockInfo _block_info; - vector _tablet_schema; - vector _tablet_schema_short; - vector string_array_short; -private: - ObjectPool* _object_pool; - RuntimeState* _runtime_state; - RowDescriptor* _row_desc; -}; - -TEST_F(TestRowBlockWithConjuncts, simpleTest) { - RowBlock row_block(_tablet_schema); - ASSERT_EQ(OLAP_SUCCESS, row_block.init(_block_info)); - ASSERT_EQ(row_block._buf_len, g_row_length * _block_info.row_num); - set_rows(row_block); - - vector string_array; - for (int i = 0; i < _block_info.row_num; ++i) { - make_row(string_array, i); - ASSERT_EQ(OLAP_SUCCESS, cursor.from_string(string_array)); - ASSERT_EQ(OLAP_SUCCESS, row_block.get_row_to_read(i, &other_cursor)); - ASSERT_EQ(0, cursor.cmp(other_cursor)); - } - - Expr* expr = create_expr(_block_info.row_num + 1); - ExprContent *expr_content = new ExprContent(expr); - ASSERT_TRUE(expr != NULL); - ASSERT_TRUE(expr->prepare(_runtime_state, *_row_desc, expr_content).ok()); - - ASSERT_EQ(OLAP_SUCCESS, row_block.eval_conjuncts(expr_content)); - ASSERT_EQ(_block_info.row_num, row_block.row_num()); - - for (int i = 0; i < row_block.row_num(); ++i) { - make_row(string_array, i); - ASSERT_EQ(OLAP_SUCCESS, 
cursor.from_string(string_array)); - ASSERT_EQ(OLAP_SUCCESS, row_block.get_row_to_read(i, &other_cursor)); - ASSERT_EQ(0, cursor.cmp(other_cursor)); } } -TEST_F(TestRowBlockWithConjuncts, simpleEvalTest) { - RowBlock row_block(_tablet_schema); - ASSERT_EQ(OLAP_SUCCESS, row_block.init(_block_info)); - set_rows(row_block); - - Expr* expr = create_expr(_block_info.row_num / 2); - ExprContent *expr_content = new ExprContext(expr); - ASSERT_TRUE(expr != NULL); - ASSERT_TRUE(expr->prepare(_runtime_state, *_row_desc, expr_content).ok()); - - // vector conjuncts; - // conjuncts.push_back(expr_content); - ASSERT_EQ(OLAP_SUCCESS, row_block.eval_conjuncts(expr_content)); - ASSERT_EQ(_block_info.row_num / 2, row_block.row_num()); - - vector string_array; - for (int i = 0; i < row_block.row_num(); ++i) { - make_row(string_array, i); - ASSERT_EQ(OLAP_SUCCESS, cursor.from_string(string_array)); - ASSERT_EQ(OLAP_SUCCESS, row_block.get_row_to_read(i, &other_cursor)); - ASSERT_EQ(0, cursor.cmp(other_cursor)); +TEST_F(TestRowBlock, clear) { + std::vector fields; + { + // k1: bigint + { + FieldInfo info; + info.name = "k1"; + info.type = OLAP_FIELD_TYPE_BIGINT; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 8; + info.is_key = true; + fields.push_back(info); + } + // k2: char + { + FieldInfo info; + info.name = "k2"; + info.type = OLAP_FIELD_TYPE_CHAR; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 10; + info.is_key = true; + fields.push_back(info); + } + // k3: varchar + { + FieldInfo info; + info.name = "k3"; + info.type = OLAP_FIELD_TYPE_VARCHAR; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 20; + info.is_key = true; + fields.push_back(info); + } } + // + RowBlock block(fields); + RowBlockInfo block_info; + block_info.row_num = 1024; + block_info.data_file_type = OLAP_DATA_FILE; + block_info.null_supported = true; + auto res = block.init(block_info); + ASSERT_EQ(OLAP_SUCCESS, res); + + block.finalize(5); + ASSERT_EQ(5, 
block.row_num()); + ASSERT_EQ(1024, block.capacity()); + block.clear(); + ASSERT_EQ(1024, block.row_num()); } -TEST_F(TestRowBlockWithConjuncts, RestoreTest) { - RowBlock row_block(_tablet_schema); - ASSERT_EQ(OLAP_SUCCESS, row_block.init(_block_info)); - set_rows(row_block); - ASSERT_EQ(_block_info.row_num, row_block.row_num()); - - ASSERT_EQ(OLAP_ERR_INPUT_PARAMETER_ERROR, row_block.backup()); - - Expr* expr = create_expr(_block_info.row_num / 2); - ASSERT_TRUE(expr != NULL); - ASSERT_TRUE(expr->prepare(_runtime_state, *_row_desc).ok()); - - vector conjuncts; - conjuncts.push_back(expr); - ASSERT_EQ(OLAP_SUCCESS, row_block.eval_conjuncts(conjuncts)); - ASSERT_EQ(_block_info.row_num / 2, row_block.row_num()); - - vector string_array; - for (int i = 0; i < row_block.row_num(); ++i) { - make_row(string_array, i); - ASSERT_EQ(OLAP_SUCCESS, cursor.from_string(string_array)); - ASSERT_EQ(OLAP_SUCCESS, row_block.get_row_to_read(i, &other_cursor)); - ASSERT_EQ(0, cursor.cmp(other_cursor)); +TEST_F(TestRowBlock, pos_limit) { + std::vector fields; + { + // k1: bigint + { + FieldInfo info; + info.name = "k1"; + info.type = OLAP_FIELD_TYPE_BIGINT; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 8; + info.is_key = true; + fields.push_back(info); + } + // k2: char + { + FieldInfo info; + info.name = "k2"; + info.type = OLAP_FIELD_TYPE_CHAR; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 10; + info.is_key = true; + fields.push_back(info); + } + // k3: varchar + { + FieldInfo info; + info.name = "k3"; + info.type = OLAP_FIELD_TYPE_VARCHAR; + info.aggregation = OLAP_FIELD_AGGREGATION_NONE; + info.length = 20; + info.is_key = true; + fields.push_back(info); + } } + // + RowBlock block(fields); + RowBlockInfo block_info; + block_info.row_num = 1024; + block_info.data_file_type = OLAP_DATA_FILE; + block_info.null_supported = true; + auto res = block.init(block_info); + ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(OLAP_SUCCESS, row_block.backup()); + 
// assert init value + ASSERT_EQ(0, block.pos()); + ASSERT_EQ(0, block.limit()); + ASSERT_FALSE(block.has_remaining()); + ASSERT_EQ(DEL_PARTIAL_SATISFIED, block.block_status()); - expr = create_expr(_block_info.row_num / 4); - ExprContent *expr_content = new ExprContent(expr); - ASSERT_TRUE(expr != NULL); - ASSERT_TRUE(expr->prepare(_runtime_state, *_row_desc, expr_content).ok()); + block.set_limit(100); + ASSERT_EQ(100, block.limit()); + ASSERT_TRUE(block.has_remaining()); + ASSERT_EQ(100, block.remaining()); - //conjuncts.push_back(expr); - ASSERT_EQ(OLAP_SUCCESS, row_block.eval_conjuncts(expr_content)); - ASSERT_EQ(_block_info.row_num / 4, row_block.row_num()); + block.set_pos(2); + ASSERT_TRUE(block.has_remaining()); + ASSERT_EQ(98, block.remaining()); - for (int i = 0; i < row_block.row_num(); ++i) { - make_row(string_array, i); - ASSERT_EQ(OLAP_SUCCESS, cursor.from_string(string_array)); - ASSERT_EQ(OLAP_SUCCESS, row_block.get_row_to_read(i, &other_cursor)); - ASSERT_EQ(0, cursor.cmp(other_cursor)); - } + block.pos_inc(); + ASSERT_TRUE(block.has_remaining()); + ASSERT_EQ(97, block.remaining()); - ASSERT_EQ(OLAP_SUCCESS, row_block.restore()); - ASSERT_EQ(_block_info.row_num / 2, row_block.row_num()); - - for (int i = 0; i < row_block.row_num(); ++i) { - make_row(string_array, i); - ASSERT_EQ(OLAP_SUCCESS, cursor.from_string(string_array)); - ASSERT_EQ(OLAP_SUCCESS, row_block.get_row_to_read(i, &other_cursor)); - ASSERT_EQ(0, cursor.cmp(other_cursor)); - } + block.set_block_status(DEL_SATISFIED); + ASSERT_EQ(DEL_SATISFIED, block.block_status()); } - } // @brief Test Stub -int main( int argc, char** argv ) { +int main(int argc, char** argv) { std::string conffile = std::string(getenv("PALO_HOME")) + "/conf/be.conf"; if (!palo::config::init(conffile.c_str(), false)) { fprintf(stderr, "error read config file. 
\n"); @@ -736,6 +802,6 @@ int main( int argc, char** argv ) { int ret = palo::OLAP_SUCCESS; testing::InitGoogleTest(&argc, argv); ret = RUN_ALL_TESTS(); - google::protobuf::ShutdownProtobufLibrary(); return ret; } + diff --git a/be/test/olap/row_cursor_test.cpp b/be/test/olap/row_cursor_test.cpp new file mode 100644 index 0000000000..8a75d5d9fc --- /dev/null +++ b/be/test/olap/row_cursor_test.cpp @@ -0,0 +1,583 @@ +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "olap/row_cursor.h" +#include "runtime/mem_tracker.h" +#include "runtime/mem_pool.h" +#include "util/logging.h" + +namespace palo { + +void set_tablet_schema_for_init(std::vector* tablet_schema) { + FieldInfo k1; + k1.name = "k1"; + k1.type = OLAP_FIELD_TYPE_TINYINT; + k1.length = 1; + k1.is_key = true; + k1.index_length = 1; + k1.is_allow_null = true; + tablet_schema->push_back(k1); + + FieldInfo k2; + k2.name = "k2"; + k2.type = OLAP_FIELD_TYPE_SMALLINT; + k2.length = 2; + k2.default_value = "0"; + k2.is_key = true; + k2.index_length = 2; + k2.is_allow_null = true; + tablet_schema->push_back(k2); + + FieldInfo k3; + k3.name = "k3"; + k3.type = OLAP_FIELD_TYPE_INT; + k3.length = 4; + k3.is_key = true; + k3.index_length = 4; + k3.is_allow_null = true; + tablet_schema->push_back(k3); + + FieldInfo k4; + k4.name = "k4"; + k4.type = OLAP_FIELD_TYPE_DATE; + k4.length = 3; + k4.is_key = true; + k4.index_length = 3; + k4.is_allow_null = true; + tablet_schema->push_back(k4); + + FieldInfo k5; + k5.name = "k5"; + k5.type = OLAP_FIELD_TYPE_DATETIME; + k5.length = 8; + k5.is_key = true; + k5.index_length = 8; + k5.is_allow_null = true; + tablet_schema->push_back(k5); + + FieldInfo k6; + k6.name = "k6"; + k6.type = OLAP_FIELD_TYPE_DECIMAL; + k6.length = 12; + k6.precision = 6; + k6.frac = 3; + k6.is_key = true; + k6.index_length = 12; + k6.is_allow_null = true; + tablet_schema->push_back(k6); + + FieldInfo k7; + k7.name = "k7"; + k7.type = OLAP_FIELD_TYPE_CHAR; + k7.length = 4; + k7.default_value = "char"; + k7.is_key = true; + k7.index_length = 4; + k7.is_allow_null = true; + tablet_schema->push_back(k7); + + FieldInfo v1; + v1.name = "v1"; + v1.type = OLAP_FIELD_TYPE_BIGINT; + v1.length = 8; + v1.aggregation = OLAP_FIELD_AGGREGATION_SUM; + v1.is_key = false; + v1.is_allow_null = true; + tablet_schema->push_back(v1); + + FieldInfo v2; + v2.name = "v2"; + v2.type = OLAP_FIELD_TYPE_VARCHAR; + v2.length = 16 + OLAP_STRING_MAX_BYTES; + v2.aggregation = 
OLAP_FIELD_AGGREGATION_REPLACE; + v2.is_key = false; + v2.is_allow_null = true; + tablet_schema->push_back(v2); + + FieldInfo v3; + v3.name = "v3"; + v3.type = OLAP_FIELD_TYPE_LARGEINT; + v3.length = 16; + v3.aggregation = OLAP_FIELD_AGGREGATION_MAX; + v3.is_key = false; + v3.is_allow_null = true; + tablet_schema->push_back(v3); + + FieldInfo v4; + v4.name = "v4"; + v4.type = OLAP_FIELD_TYPE_DECIMAL; + v4.length = 12; + v4.aggregation = OLAP_FIELD_AGGREGATION_MIN; + v4.is_key = false; + v4.is_allow_null = true; + tablet_schema->push_back(v4); + + FieldInfo v5; + v5.name = "v5"; + v5.type = OLAP_FIELD_TYPE_HLL; + v5.length = HLL_COLUMN_DEFAULT_LEN; + v5.aggregation = OLAP_FIELD_AGGREGATION_HLL_UNION; + v5.is_key = false; + v5.is_allow_null = true; + tablet_schema->push_back(v5); +} + +void set_tablet_schema_for_scan_key(std::vector* tablet_schema) { + FieldInfo k1; + k1.name = "k1"; + k1.type = OLAP_FIELD_TYPE_CHAR; + k1.length = 4; + k1.index_length = 4; + k1.default_value = "char"; + k1.is_key = true; + k1.is_allow_null = true; + tablet_schema->push_back(k1); + + FieldInfo k2; + k2.name = "k2"; + k2.type = OLAP_FIELD_TYPE_VARCHAR; + k2.length = 16 + OLAP_STRING_MAX_BYTES; + k2.index_length = 20; + k2.is_key = true; + k2.is_allow_null = true; + tablet_schema->push_back(k2); + + FieldInfo v1; + v1.name = "v1"; + v1.type = OLAP_FIELD_TYPE_LARGEINT; + v1.length = 16; + v1.aggregation = OLAP_FIELD_AGGREGATION_MAX; + v1.is_key = false; + v1.is_allow_null = true; + tablet_schema->push_back(v1); + + FieldInfo v2; + v2.name = "v2"; + v2.type = OLAP_FIELD_TYPE_DECIMAL; + v2.length = 12; + v2.aggregation = OLAP_FIELD_AGGREGATION_MIN; + v2.is_key = false; + v2.is_allow_null = true; + tablet_schema->push_back(v2); +} + +void set_tablet_schema_for_cmp_and_aggregate(std::vector* tablet_schema) { + FieldInfo k1; + k1.name = "k1"; + k1.type = OLAP_FIELD_TYPE_CHAR; + k1.length = 4; + k1.default_value = "char"; + k1.is_key = true; + k1.index_length = 4; + k1.is_allow_null = true; + 
tablet_schema->push_back(k1); + + FieldInfo k2; + k2.name = "k2"; + k2.type = OLAP_FIELD_TYPE_INT; + k2.length = 4; + k2.is_key = true; + k2.index_length = 4; + k2.is_allow_null = true; + tablet_schema->push_back(k2); + + FieldInfo v1; + v1.name = "v1"; + v1.type = OLAP_FIELD_TYPE_LARGEINT; + v1.length = 16; + v1.aggregation = OLAP_FIELD_AGGREGATION_SUM; + v1.is_key = false; + v1.is_allow_null = true; + tablet_schema->push_back(v1); + + FieldInfo v2; + v2.name = "v2"; + v2.type = OLAP_FIELD_TYPE_DOUBLE; + v2.length = 8; + v2.aggregation = OLAP_FIELD_AGGREGATION_MIN; + v2.is_key = false; + v2.is_allow_null = true; + tablet_schema->push_back(v2); + + FieldInfo v3; + v3.name = "v3"; + v3.type = OLAP_FIELD_TYPE_DECIMAL; + v3.length = 12; + v3.aggregation = OLAP_FIELD_AGGREGATION_MAX; + v3.is_key = false; + v3.is_allow_null = true; + tablet_schema->push_back(v3); + + FieldInfo v4; + v4.name = "v4"; + v4.type = OLAP_FIELD_TYPE_VARCHAR; + v4.length = 16 + OLAP_STRING_MAX_BYTES; + v4.aggregation = OLAP_FIELD_AGGREGATION_REPLACE; + v4.is_key = false; + v4.is_allow_null = true; + tablet_schema->push_back(v4); +} + +class TestRowCursor : public testing::Test { +public: + TestRowCursor() { + _mem_tracker.reset(new MemTracker(-1)); + _mem_pool.reset(new MemPool(_mem_tracker.get())); + } + + virtual void SetUp() {} + + virtual void TearDown() {} + + std::unique_ptr _mem_tracker; + std::unique_ptr _mem_pool; +}; + +TEST_F(TestRowCursor, InitRowCursor) { + std::vector tablet_schema; + set_tablet_schema_for_init(&tablet_schema); + RowCursor row; + OLAPStatus res = row.init(tablet_schema); + ASSERT_EQ(res, OLAP_SUCCESS); + ASSERT_EQ(row.get_fixed_len(), 126); + ASSERT_EQ(row.get_variable_len(), 16413); +} + +TEST_F(TestRowCursor, InitRowCursorWithColumnCount) { + std::vector tablet_schema; + set_tablet_schema_for_init(&tablet_schema); + RowCursor row; + OLAPStatus res = row.init(tablet_schema, 5); + ASSERT_EQ(res, OLAP_SUCCESS); + ASSERT_EQ(row.get_fixed_len(), 23); + 
ASSERT_EQ(row.get_variable_len(), 0); + row.allocate_memory_for_string_type(tablet_schema); + ASSERT_EQ(row.get_variable_len(), 0); +} + +TEST_F(TestRowCursor, InitRowCursorWithColIds) { + std::vector tablet_schema; + set_tablet_schema_for_init(&tablet_schema); + + std::vector col_ids; + for (size_t i = 0; i < tablet_schema.size() / 2; ++i) { + col_ids.push_back(i * 2); + } + + RowCursor row; + OLAPStatus res = row.init(tablet_schema, col_ids); + ASSERT_EQ(res, OLAP_SUCCESS); + ASSERT_EQ(row.get_fixed_len(), 63); + ASSERT_EQ(row.get_variable_len(), 20); +} + +TEST_F(TestRowCursor, InitRowCursorWithScanKey) { + std::vector tablet_schema; + set_tablet_schema_for_scan_key(&tablet_schema); + + std::vector scan_keys; + scan_keys.push_back("char_exceed_length"); + scan_keys.push_back("varchar_exceed_length"); + + RowCursor row; + OLAPStatus res = row.init_scan_key(tablet_schema, scan_keys); + ASSERT_EQ(res, OLAP_SUCCESS); + ASSERT_EQ(row.get_fixed_len(), 34); + ASSERT_EQ(row.get_variable_len(), 39); + + res = row.from_string(scan_keys); + ASSERT_EQ(res, OLAP_SUCCESS); + + std::vector vec_string = row.to_string_vector(); + ASSERT_TRUE(strncmp(vec_string[0].c_str(), "0&char_exceed_length", vec_string[0].size())); + ASSERT_TRUE(strncmp(vec_string[1].c_str(), "0&varchar_exceed_length", vec_string[1].size())); +} + +TEST_F(TestRowCursor, SetMinAndMaxKey) { + std::vector tablet_schema; + set_tablet_schema_for_init(&tablet_schema); + + RowCursor min_row; + OLAPStatus res = min_row.init(tablet_schema); + ASSERT_EQ(res, OLAP_SUCCESS); + ASSERT_EQ(min_row.get_fixed_len(), 126); + + res = min_row.build_min_key(); + ASSERT_EQ(res, OLAP_SUCCESS); + for (size_t i = 0; i < tablet_schema.size(); ++i) { + ASSERT_TRUE(min_row.is_min(i)); + } + + RowCursor max_row; + res = max_row.init(tablet_schema); + ASSERT_EQ(res, OLAP_SUCCESS); + ASSERT_EQ(max_row.get_fixed_len(), 126); +} + +TEST_F(TestRowCursor, EqualAndCompare) { + std::vector tablet_schema; + 
set_tablet_schema_for_cmp_and_aggregate(&tablet_schema); + + RowCursor left; + OLAPStatus res = left.init(tablet_schema); + ASSERT_EQ(res, OLAP_SUCCESS); + ASSERT_EQ(left.get_fixed_len(), 78); + ASSERT_EQ(left.get_variable_len(), 20); + + StringSlice l_char("well"); + int32_t l_int = 10; + left.set_field_content(0, reinterpret_cast(&l_char), _mem_pool.get()); + left.set_field_content(1, reinterpret_cast(&l_int), _mem_pool.get()); + + // right row only has k2 in int type + std::vector col_ids; + col_ids.push_back(1); + + RowCursor right_eq; + res = right_eq.init(tablet_schema, col_ids); + int32_t r_int_eq = 10; + right_eq.set_field_content(1, reinterpret_cast(&r_int_eq), _mem_pool.get()); + ASSERT_TRUE(left.equal(right_eq)); + ASSERT_EQ(left.cmp(right_eq), 0); + + RowCursor right_lt; + res = right_lt.init(tablet_schema, col_ids); + int32_t r_int_lt = 11; + right_lt.set_field_content(1, reinterpret_cast(&r_int_lt), _mem_pool.get()); + ASSERT_FALSE(left.equal(right_lt)); + ASSERT_LT(left.cmp(right_lt), 0); + + RowCursor right_gt; + res = right_gt.init(tablet_schema, col_ids); + int32_t r_int_gt = 9; + right_gt.set_field_content(1, reinterpret_cast(&r_int_gt), _mem_pool.get()); + ASSERT_FALSE(left.equal(right_gt)); + ASSERT_GT(left.cmp(right_gt), 0); +} + +TEST_F(TestRowCursor, IndexCmp) { + std::vector tablet_schema; + set_tablet_schema_for_cmp_and_aggregate(&tablet_schema); + + RowCursor left; + OLAPStatus res = left.init(tablet_schema); + ASSERT_EQ(res, OLAP_SUCCESS); + ASSERT_EQ(left.get_fixed_len(), 78); + ASSERT_EQ(left.get_variable_len(), 20); + + StringSlice l_char("well"); + int32_t l_int = 10; + left.set_field_content(0, reinterpret_cast(&l_char), _mem_pool.get()); + left.set_field_content(1, reinterpret_cast(&l_int), _mem_pool.get()); + + RowCursor right_eq; + res = right_eq.init(tablet_schema); + StringSlice r_char_eq("well"); + int32_t r_int_eq = 10; + right_eq.set_field_content(0, reinterpret_cast(&r_char_eq), _mem_pool.get()); + 
right_eq.set_field_content(1, reinterpret_cast(&r_int_eq), _mem_pool.get()); + + ASSERT_EQ(left.index_cmp(right_eq), 0); + + RowCursor right_lt; + res = right_lt.init(tablet_schema); + StringSlice r_char_lt("well"); + int32_t r_int_lt = 11; + right_lt.set_field_content(0, reinterpret_cast(&r_char_lt), _mem_pool.get()); + right_lt.set_field_content(1, reinterpret_cast(&r_int_lt), _mem_pool.get()); + ASSERT_LT(left.index_cmp(right_lt), 0); + + RowCursor right_gt; + res = right_gt.init(tablet_schema); + StringSlice r_char_gt("good"); + int32_t r_int_gt = 10; + right_gt.set_field_content(0, reinterpret_cast(&r_char_gt), _mem_pool.get()); + right_gt.set_field_content(1, reinterpret_cast(&r_int_gt), _mem_pool.get()); + ASSERT_GT(left.index_cmp(right_gt), 0); +} + +TEST_F(TestRowCursor, FullKeyCmp) { + std::vector tablet_schema; + set_tablet_schema_for_cmp_and_aggregate(&tablet_schema); + + RowCursor left; + OLAPStatus res = left.init(tablet_schema); + ASSERT_EQ(res, OLAP_SUCCESS); + ASSERT_EQ(left.get_fixed_len(), 78); + ASSERT_EQ(left.get_variable_len(), 20); + + StringSlice l_char("well"); + int32_t l_int = 10; + left.set_field_content(0, reinterpret_cast(&l_char), _mem_pool.get()); + left.set_field_content(1, reinterpret_cast(&l_int), _mem_pool.get()); + + RowCursor right_eq; + res = right_eq.init(tablet_schema); + StringSlice r_char_eq("well"); + int32_t r_int_eq = 10; + right_eq.set_field_content(0, reinterpret_cast(&r_char_eq), _mem_pool.get()); + right_eq.set_field_content(1, reinterpret_cast(&r_int_eq), _mem_pool.get()); + ASSERT_EQ(left.full_key_cmp(right_eq), 0); + + RowCursor right_lt; + res = right_lt.init(tablet_schema); + StringSlice r_char_lt("well"); + int32_t r_int_lt = 11; + right_lt.set_field_content(0, reinterpret_cast(&r_char_lt), _mem_pool.get()); + right_lt.set_field_content(1, reinterpret_cast(&r_int_lt), _mem_pool.get()); + ASSERT_LT(left.full_key_cmp(right_lt), 0); + + RowCursor right_gt; + res = right_gt.init(tablet_schema); + StringSlice 
r_char_gt("good"); + int32_t r_int_gt = 10; + right_gt.set_field_content(0, reinterpret_cast(&r_char_gt), _mem_pool.get()); + right_gt.set_field_content(1, reinterpret_cast(&r_int_gt), _mem_pool.get()); + ASSERT_GT(left.full_key_cmp(right_gt), 0); +} + +TEST_F(TestRowCursor, AggregateWithoutNull) { + std::vector tablet_schema; + set_tablet_schema_for_cmp_and_aggregate(&tablet_schema); + + RowCursor row; + OLAPStatus res = row.init(tablet_schema); + ASSERT_EQ(res, OLAP_SUCCESS); + ASSERT_EQ(row.get_fixed_len(), 78); + ASSERT_EQ(row.get_variable_len(), 20); + row.allocate_memory_for_string_type(tablet_schema); + + RowCursor left; + res = left.init(tablet_schema); + + StringSlice l_char("well"); + int32_t l_int = 10; + int128_t l_largeint = (int128_t)(1) << 100; + double l_double = 8.8; + decimal12_t l_decimal(11, 22); + StringSlice l_varchar("beijing"); + left.set_field_content(0, reinterpret_cast(&l_char), _mem_pool.get()); + left.set_field_content(1, reinterpret_cast(&l_int), _mem_pool.get()); + left.set_field_content(2, reinterpret_cast(&l_largeint), _mem_pool.get()); + left.set_field_content(3, reinterpret_cast(&l_double), _mem_pool.get()); + left.set_field_content(4, reinterpret_cast(&l_decimal), _mem_pool.get()); + left.set_field_content(5, reinterpret_cast(&l_varchar), _mem_pool.get()); + + res = row.agg_init(left); + ASSERT_EQ(res, OLAP_SUCCESS); + + RowCursor right; + res = right.init(tablet_schema); + StringSlice r_char("well"); + int32_t r_int = 10; + int128_t r_largeint = (int128_t)(1) << 100; + double r_double = 5.5; + decimal12_t r_decimal(22, 22); + StringSlice r_varchar("shenzhen"); + right.set_field_content(0, reinterpret_cast(&r_char), _mem_pool.get()); + right.set_field_content(1, reinterpret_cast(&r_int), _mem_pool.get()); + right.set_field_content(2, reinterpret_cast(&r_largeint), _mem_pool.get()); + right.set_field_content(3, reinterpret_cast(&r_double), _mem_pool.get()); + right.set_field_content(4, reinterpret_cast(&r_decimal), 
_mem_pool.get()); + right.set_field_content(5, reinterpret_cast(&r_varchar), _mem_pool.get()); + + row.aggregate(right); + + int128_t agg_value = *reinterpret_cast(row.get_field_content_ptr(2)); + ASSERT_TRUE(agg_value == ((int128_t)(1) << 101)); + + double agg_double = *reinterpret_cast(row.get_field_content_ptr(3)); + ASSERT_TRUE(agg_double == r_double); + + decimal12_t agg_decimal = *reinterpret_cast(row.get_field_content_ptr(4)); + ASSERT_TRUE(agg_decimal == r_decimal); + + StringSlice* agg_varchar = reinterpret_cast(row.get_field_content_ptr(5)); + ASSERT_EQ(agg_varchar->compare(r_varchar), 0); +} + +TEST_F(TestRowCursor, AggregateWithNull) { + std::vector tablet_schema; + set_tablet_schema_for_cmp_and_aggregate(&tablet_schema); + + RowCursor row; + OLAPStatus res = row.init(tablet_schema); + ASSERT_EQ(res, OLAP_SUCCESS); + ASSERT_EQ(row.get_fixed_len(), 78); + ASSERT_EQ(row.get_variable_len(), 20); + row.allocate_memory_for_string_type(tablet_schema); + + RowCursor left; + res = left.init(tablet_schema); + + StringSlice l_char("well"); + int32_t l_int = 10; + int128_t l_largeint = (int128_t)(1) << 100; + StringSlice l_varchar("beijing"); + left.set_field_content(0, reinterpret_cast(&l_char), _mem_pool.get()); + left.set_field_content(1, reinterpret_cast(&l_int), _mem_pool.get()); + left.set_field_content(2, reinterpret_cast(&l_largeint), _mem_pool.get()); + left.set_null(3); + left.set_null(4); + left.set_field_content(5, reinterpret_cast(&l_varchar), _mem_pool.get()); + + res = row.agg_init(left); + ASSERT_EQ(res, OLAP_SUCCESS); + + RowCursor right; + res = right.init(tablet_schema); + StringSlice r_char("well"); + int32_t r_int = 10; + int128_t r_largeint = (int128_t)(1) << 100; + double r_double = 5.5; + decimal12_t r_decimal(22, 22); + right.set_field_content(0, reinterpret_cast(&r_char), _mem_pool.get()); + right.set_field_content(1, reinterpret_cast(&r_int), _mem_pool.get()); + right.set_field_content(2, reinterpret_cast(&r_largeint), _mem_pool.get()); 
+ right.set_field_content(3, reinterpret_cast(&r_double), _mem_pool.get()); + right.set_field_content(4, reinterpret_cast(&r_decimal), _mem_pool.get()); + right.set_null(5); + + row.aggregate(right); + + int128_t agg_value = *reinterpret_cast(row.get_field_content_ptr(2)); + ASSERT_TRUE(agg_value == ((int128_t)(1) << 101)); + + bool is_null_double = left.is_null(3); + ASSERT_TRUE(is_null_double); + + decimal12_t agg_decimal = *reinterpret_cast(row.get_field_content_ptr(4)); + ASSERT_TRUE(agg_decimal == r_decimal); + + bool is_null_varchar = row.is_null(5); + ASSERT_TRUE(is_null_varchar); +} + +} // namespace palo + +int main(int argc, char** argv) { + std::string conffile = std::string(getenv("PALO_HOME")) + "/conf/be.conf"; + if (!palo::config::init(conffile.c_str(), false)) { + fprintf(stderr, "error read config file. \n"); + return -1; + } + palo::init_glog("be-test"); + int ret = palo::OLAP_SUCCESS; + testing::InitGoogleTest(&argc, argv); + ret = RUN_ALL_TESTS(); + return ret; +} diff --git a/be/test/olap/run_length_byte_test.cpp b/be/test/olap/run_length_byte_test.cpp index 51f89f4288..81058520fb 100755 --- a/be/test/olap/run_length_byte_test.cpp +++ b/be/test/olap/run_length_byte_test.cpp @@ -508,9 +508,9 @@ TEST(TestStream, SeekUncompress) { index_entry.write_to_buffer(buffer); StreamIndexHeader header; header.position_format = index_entry.positions_count(); - header.statistic_format = OLAP_FIELD_TYPE_NONE; + header.statistic_format = OLAP_FIELD_TYPE_TINYINT; PositionEntryReader entry; - entry.init(&header, OLAP_FIELD_TYPE_NONE, false); + entry.init(&header, OLAP_FIELD_TYPE_TINYINT, false); entry.attach(buffer); PositionProvider position(&entry); @@ -602,9 +602,9 @@ TEST(TestStream, SeekCompress) { index_entry.write_to_buffer(buffer); StreamIndexHeader header; header.position_format = index_entry.positions_count(); - header.statistic_format = OLAP_FIELD_TYPE_NONE; + header.statistic_format = OLAP_FIELD_TYPE_TINYINT; PositionEntryReader entry; - 
entry.init(&header, OLAP_FIELD_TYPE_NONE, false); + entry.init(&header, OLAP_FIELD_TYPE_TINYINT, false); entry.attach(buffer); PositionProvider position(&entry); @@ -697,7 +697,8 @@ public: 0, helper.length(), NULL, - OLAP_DEFAULT_COLUMN_STREAM_BUFFER_SIZE); + OLAP_DEFAULT_COLUMN_STREAM_BUFFER_SIZE, + &_stats); ASSERT_EQ(OLAP_SUCCESS, _stream->init()); _reader = new (std::nothrow) RunLengthByteReader(_stream); @@ -710,6 +711,7 @@ public: FileHandler helper; ByteBuffer* _shared_buffer; ReadOnlyFileStream* _stream; + OlapReaderStatistics _stats; }; @@ -803,9 +805,9 @@ TEST_F(TestRunLengthByte, Seek) { index_entry.write_to_buffer(buffer); StreamIndexHeader header; header.position_format = index_entry.positions_count(); - header.statistic_format = OLAP_FIELD_TYPE_NONE; + header.statistic_format = OLAP_FIELD_TYPE_TINYINT; PositionEntryReader entry; - entry.init(&header, OLAP_FIELD_TYPE_NONE, false); + entry.init(&header, OLAP_FIELD_TYPE_TINYINT, false); entry.attach(buffer); PositionProvider position(&entry); diff --git a/be/test/olap/run_length_integer_test.cpp b/be/test/olap/run_length_integer_test.cpp index 7cf3822ab1..4f13442022 100755 --- a/be/test/olap/run_length_integer_test.cpp +++ b/be/test/olap/run_length_integer_test.cpp @@ -71,7 +71,8 @@ public: 0, helper.length(), NULL, - OLAP_DEFAULT_COLUMN_STREAM_BUFFER_SIZE); + OLAP_DEFAULT_COLUMN_STREAM_BUFFER_SIZE, + &_stats); ASSERT_EQ(OLAP_SUCCESS, _stream->init()); _reader = new (std::nothrow) RunLengthIntegerReader(_stream, false); @@ -84,6 +85,7 @@ public: FileHandler helper; ByteBuffer* _shared_buffer; ReadOnlyFileStream* _stream; + OlapReaderStatistics _stats; }; @@ -148,7 +150,7 @@ TEST_F(TestRunLengthUnsignInteger, seek) { PositionEntryReader entry; entry._positions = index_entry._positions; entry._positions_count = index_entry._positions_count; - entry._statistics.init(OLAP_FIELD_TYPE_NONE, false); + entry._statistics.init(OLAP_FIELD_TYPE_INT, false); PositionProvider position(&entry); 
_reader->seek(&position); @@ -381,7 +383,8 @@ virtual void SetUp() { 0, helper.length(), NULL, - OLAP_DEFAULT_COLUMN_STREAM_BUFFER_SIZE); + OLAP_DEFAULT_COLUMN_STREAM_BUFFER_SIZE, + &_stats); ASSERT_EQ(OLAP_SUCCESS, _stream->init()); _reader = new (std::nothrow) RunLengthIntegerReader(_stream, false); @@ -394,6 +397,7 @@ virtual void SetUp() { FileHandler helper; ByteBuffer* _shared_buffer; ReadOnlyFileStream* _stream; + OlapReaderStatistics _stats; }; @@ -474,7 +478,7 @@ TEST_F(TestRunLengthSignInteger, seek) { PositionEntryReader entry; entry._positions = index_entry._positions; entry._positions_count = index_entry._positions_count; - entry._statistics.init(OLAP_FIELD_TYPE_NONE, false); + entry._statistics.init(OLAP_FIELD_TYPE_INT, false); PositionProvider position(&entry); _reader->seek(&position); diff --git a/be/test/olap/stream_index_test.cpp b/be/test/olap/stream_index_test.cpp index 14d108a4af..d55fd20907 100755 --- a/be/test/olap/stream_index_test.cpp +++ b/be/test/olap/stream_index_test.cpp @@ -16,7 +16,6 @@ #include #include -#include "olap/field.h" #include "olap/olap_cond.h" #include "olap/olap_define.h" #include "olap/olap_engine.h" @@ -24,6 +23,7 @@ #include "olap/olap_table.h" #include "olap/olap_common.h" #include "olap/row_cursor.h" +#include "olap/wrapper_field.h" #include "olap/column_file/stream_index_common.h" #include "olap/column_file/stream_index_writer.h" #include "olap/column_file/stream_index_reader.h" @@ -51,19 +51,27 @@ public: TEST_F(TestStreamIndex, index_write) { StreamIndexWriter writer(OLAP_FIELD_TYPE_INT); PositionEntryWriter entry; + ColumnStatistics stat; + stat.init(OLAP_FIELD_TYPE_INT, true); + + ASSERT_EQ(OLAP_SUCCESS, stat.init(OLAP_FIELD_TYPE_INT, true)); static const uint32_t loop = 10; uint32_t i = 0; for (; i < loop; i++) { entry.add_position(i); - entry.add_position(i*2); - entry.add_position(i*3); + entry.add_position(i * 2); + entry.add_position(i * 3); + entry.set_statistic(&stat); writer.add_index_entry(entry); 
entry.reset_write_offset(); } size_t output_size = sizeof(StreamIndexHeader) + i * sizeof(uint32_t) * 3; + // for statistics + output_size += (sizeof(int) + 1) * 2 * loop; + ASSERT_EQ(output_size, writer.output_size()); char* buffer = new char[output_size]; @@ -71,7 +79,7 @@ TEST_F(TestStreamIndex, index_write) { ASSERT_EQ(OLAP_SUCCESS, writer.write_to_buffer(buffer, output_size)); StreamIndexReader reader; - ASSERT_EQ(OLAP_SUCCESS, reader.init(buffer, output_size, OLAP_FIELD_TYPE_NONE, true, false)); + ASSERT_EQ(OLAP_SUCCESS, reader.init(buffer, output_size, OLAP_FIELD_TYPE_INT, true, true)); ASSERT_EQ(loop, reader.entry_count()); @@ -86,6 +94,8 @@ TEST_F(TestStreamIndex, index_write) { TEST_F(TestStreamIndex, remove_written_position) { StreamIndexWriter writer(OLAP_FIELD_TYPE_INT); PositionEntryWriter entry; + ColumnStatistics stat; + stat.init(OLAP_FIELD_TYPE_INT, true); static const uint32_t loop = 10; //test 1 @@ -100,6 +110,7 @@ TEST_F(TestStreamIndex, remove_written_position) { entry.add_position(i*6); entry.add_position(i*7); + entry.set_statistic(&stat); writer.add_index_entry(entry); entry.reset_write_offset(); } @@ -110,14 +121,13 @@ TEST_F(TestStreamIndex, remove_written_position) { } size_t output_size = writer.output_size(); - char* buffer = new char[output_size]; ASSERT_EQ(OLAP_SUCCESS, writer.write_to_buffer(buffer, output_size)); StreamIndexReader reader; ASSERT_EQ(OLAP_SUCCESS, - reader.init(buffer, output_size, OLAP_FIELD_TYPE_NONE, true, false)); + reader.init(buffer, output_size, OLAP_FIELD_TYPE_INT, true, true)); ASSERT_EQ(loop, reader.entry_count()); @@ -142,6 +152,7 @@ TEST_F(TestStreamIndex, remove_written_position) { entry.add_position(i*6); entry.add_position(i*7); + entry.set_statistic(&stat); writer.add_index_entry(entry); entry.reset_write_offset(); } @@ -152,14 +163,13 @@ TEST_F(TestStreamIndex, remove_written_position) { } size_t output_size = writer.output_size(); - char* buffer = new char[output_size]; ASSERT_EQ(OLAP_SUCCESS, 
writer.write_to_buffer(buffer, output_size)); StreamIndexReader reader; ASSERT_EQ(OLAP_SUCCESS, - reader.init(buffer, output_size, OLAP_FIELD_TYPE_NONE, true, false)); + reader.init(buffer, output_size, OLAP_FIELD_TYPE_INT, true, true)); ASSERT_EQ(loop, reader.entry_count()); @@ -185,6 +195,7 @@ TEST_F(TestStreamIndex, remove_written_position) { entry.add_position(i*6); entry.add_position(i*7); + entry.set_statistic(&stat); writer.add_index_entry(entry); entry.reset_write_offset(); } @@ -195,14 +206,13 @@ TEST_F(TestStreamIndex, remove_written_position) { } size_t output_size = writer.output_size(); - char* buffer = new char[output_size]; ASSERT_EQ(OLAP_SUCCESS, writer.write_to_buffer(buffer, output_size)); StreamIndexReader reader; ASSERT_EQ(OLAP_SUCCESS, - reader.init(buffer, output_size, OLAP_FIELD_TYPE_NONE, true, false)); + reader.init(buffer, output_size, OLAP_FIELD_TYPE_INT, true, true)); ASSERT_EQ(loop, reader.entry_count()); @@ -228,6 +238,7 @@ TEST_F(TestStreamIndex, remove_written_position) { entry.add_position(i*6); entry.add_position(i*7); + entry.set_statistic(&stat); writer.add_index_entry(entry); entry.reset_write_offset(); } @@ -240,12 +251,11 @@ TEST_F(TestStreamIndex, remove_written_position) { size_t output_size = writer.output_size(); char* buffer = new char[output_size]; - ASSERT_EQ(OLAP_SUCCESS, writer.write_to_buffer(buffer, output_size)); StreamIndexReader reader; ASSERT_EQ(OLAP_SUCCESS, - reader.init(buffer, output_size, OLAP_FIELD_TYPE_NONE, true, false)); + reader.init(buffer, output_size, OLAP_FIELD_TYPE_INT, true, true)); ASSERT_EQ(loop, reader.entry_count()); @@ -264,8 +274,7 @@ TEST_F(TestStreamIndex, test_statistic) { ColumnStatistics stat; ASSERT_EQ(OLAP_SUCCESS, stat.init(OLAP_FIELD_TYPE_INT, true)); - Field* field = Field::create_by_type(OLAP_FIELD_TYPE_INT); - ASSERT_TRUE(field->allocate()); + WrapperField* field = WrapperField::create_by_type(OLAP_FIELD_TYPE_INT); // start ASSERT_STREQ(stat.minimum()->to_string().c_str(), 
"2147483647"); @@ -273,25 +282,25 @@ TEST_F(TestStreamIndex, test_statistic) { // 1 field->from_string("3"); - stat.add(field); + stat.add(field->field_ptr()); ASSERT_STREQ(stat.minimum()->to_string().c_str(), "3"); ASSERT_STREQ(stat.maximum()->to_string().c_str(), "3"); // 2 field->from_string("5"); - stat.add(field); + stat.add(field->field_ptr()); ASSERT_STREQ(stat.minimum()->to_string().c_str(), "3"); ASSERT_STREQ(stat.maximum()->to_string().c_str(), "5"); // 3 field->from_string("899"); - stat.add(field); + stat.add(field->field_ptr()); ASSERT_STREQ(stat.minimum()->to_string().c_str(), "3"); ASSERT_STREQ(stat.maximum()->to_string().c_str(), "899"); // 4 field->from_string("-111"); - stat.add(field); + stat.add(field->field_ptr()); ASSERT_STREQ(stat.minimum()->to_string().c_str(), "-111"); ASSERT_STREQ(stat.maximum()->to_string().c_str(), "899"); @@ -301,9 +310,9 @@ TEST_F(TestStreamIndex, test_statistic) { ASSERT_STREQ(stat.maximum()->to_string().c_str(), "-2147483648"); field->from_string("3"); - stat.add(field); + stat.add(field->field_ptr()); field->from_string("6"); - stat.add(field); + stat.add(field->field_ptr()); ASSERT_STREQ(stat.minimum()->to_string().c_str(), "3"); ASSERT_STREQ(stat.maximum()->to_string().c_str(), "6"); @@ -325,9 +334,8 @@ TEST_F(TestStreamIndex, statistic) { ASSERT_EQ(OLAP_SUCCESS, stat.init(OLAP_FIELD_TYPE_INT, true)); - Field* field = Field::create_by_type(OLAP_FIELD_TYPE_INT); + WrapperField* field = WrapperField::create_by_type(OLAP_FIELD_TYPE_INT); ASSERT_TRUE(NULL != field); - ASSERT_TRUE(field->allocate()); char string_buffer[256]; static const uint32_t loop = 10; @@ -339,19 +347,14 @@ TEST_F(TestStreamIndex, statistic) { snprintf(string_buffer, sizeof(string_buffer), "%d", i * 9); field->from_string(string_buffer); - stat.add(field); + stat.add(field->field_ptr()); snprintf(string_buffer, sizeof(string_buffer), "%d", i * 2); field->from_string(string_buffer); - stat.add(field); + stat.add(field->field_ptr()); - printf("%d. 
max %s\n", i, stat.maximum()->to_string().c_str()); - printf("%d. min %s\n", i, stat.minimum()->to_string().c_str()); entry.set_statistic(&stat); - printf("%d. min -> %d\n", i, entry._statistics_buffer[0]); - printf("%d. max -> %d\n", i, entry._statistics_buffer[1]); - writer.add_index_entry(entry); entry.reset_write_offset(); } @@ -375,11 +378,6 @@ TEST_F(TestStreamIndex, statistic) { ASSERT_EQ(e.positions(0), i); ASSERT_EQ(e.positions(1), i * 2); ASSERT_EQ(e.positions(2), i * 3); - - if (!e.column_statistic().ignored()) { - printf("%d. max %s\n", i, e.column_statistic().maximum()->to_string().c_str()); - printf("%d. min %s\n", i, e.column_statistic().minimum()->to_string().c_str()); - } } } diff --git a/be/test/runtime/CMakeLists.txt b/be/test/runtime/CMakeLists.txt index b8f5eb28c6..45c400c682 100644 --- a/be/test/runtime/CMakeLists.txt +++ b/be/test/runtime/CMakeLists.txt @@ -22,30 +22,30 @@ set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/test/runtime") #ADD_BE_TEST(buffered_tuple_stream_test) -#ADD_BE_TEST(sorter_test) +# ADD_BE_TEST(sorter_test) #ADD_BE_TEST(result_writer_test) #ADD_BE_TEST(buffer_control_block_test) #ADD_BE_TEST(result_buffer_mgr_test) #ADD_BE_TEST(result_sink_test) ADD_BE_TEST(mem_pool_test) -#ADD_BE_TEST(free_list_test) -#ADD_BE_TEST(string_buffer_test) +ADD_BE_TEST(free_list_test) +ADD_BE_TEST(string_buffer_test) # ADD_BE_TEST(data_stream_test) #ADD_BE_TEST(disk_io_mgr_test) #ADD_BE_TEST(parallel_executor_test) -#ADD_BE_TEST(datetime_value_test) -#ADD_BE_TEST(decimal_value_test) -#ADD_BE_TEST(large_int_value_test) -#ADD_BE_TEST(string_value_test) +ADD_BE_TEST(datetime_value_test) +ADD_BE_TEST(decimal_value_test) +ADD_BE_TEST(large_int_value_test) +ADD_BE_TEST(string_value_test) #ADD_BE_TEST(thread_resource_mgr_test) -#ADD_BE_TEST(dpp_writer_test) +# ADD_BE_TEST(dpp_writer_test) #ADD_BE_TEST(qsorter_test) -#ADD_BE_TEST(fragment_mgr_test) +ADD_BE_TEST(fragment_mgr_test) #ADD_BE_TEST(dpp_sink_internal_test) #ADD_BE_TEST(dpp_sink_test) 
#ADD_BE_TEST(data_spliter_test) #ADD_BE_TEST(etl_job_mgr_test) -#ADD_BE_TEST(mysql_table_writer_test) +# ADD_BE_TEST(mysql_table_writer_test) ADD_BE_TEST(pull_load_task_mgr_test) ADD_BE_TEST(tmp_file_mgr_test) diff --git a/be/test/runtime/datetime_value_test.cpp b/be/test/runtime/datetime_value_test.cpp index 4e000858ec..b905941400 100644 --- a/be/test/runtime/datetime_value_test.cpp +++ b/be/test/runtime/datetime_value_test.cpp @@ -38,7 +38,7 @@ protected: // Assert size TEST_F(DateTimeValueTest, struct_size) { - ASSERT_EQ(12, sizeof(DateTimeValue)); + ASSERT_EQ(16, sizeof(DateTimeValue)); } TEST_F(DateTimeValueTest, equal) { @@ -294,7 +294,7 @@ TEST_F(DateTimeValueTest, from_unixtime) { value.from_unixtime(570672000); value.to_string(str); - ASSERT_STREQ("1988-02-01", str); + ASSERT_STREQ("1988-02-01 08:00:00", str); } // Calculate format @@ -306,11 +306,11 @@ TEST_F(DateTimeValueTest, unix_timestamp) { value.from_date_int64(19700101); ASSERT_EQ(0, value.unix_timestamp()); value.from_date_int64(19700102); - ASSERT_EQ(86400, value.unix_timestamp()); + ASSERT_EQ(86400 - 28800, value.unix_timestamp()); value.from_date_int64(19880201000000); - ASSERT_EQ(570672000, value.unix_timestamp()); + ASSERT_EQ(570672000 - 28800, value.unix_timestamp()); value.from_date_int64(20380119); - ASSERT_EQ(2147472000, value.unix_timestamp()); + ASSERT_EQ(2147472000 - 28800, value.unix_timestamp()); value.from_date_int64(20380120); ASSERT_EQ(0, value.unix_timestamp()); } diff --git a/be/test/runtime/fragment_mgr_test.cpp b/be/test/runtime/fragment_mgr_test.cpp index 1a02d524cc..94fdeae09d 100644 --- a/be/test/runtime/fragment_mgr_test.cpp +++ b/be/test/runtime/fragment_mgr_test.cpp @@ -16,7 +16,7 @@ #include #include "runtime/fragment_mgr.h" #include "runtime/plan_fragment_executor.h" -#include "runtime/mem_limit.hpp" +// #include "runtime/mem_limit.hpp" #include "runtime/row_batch.h" #include "exec/data_sink.h" #include "common/configbase.h" @@ -120,5 +120,6 @@ TEST_F(FragmentMgrTest, 
PrepareFailed) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); + palo::CpuInfo::init(); return RUN_ALL_TESTS(); } diff --git a/be/test/runtime/free_list_test.cpp b/be/test/runtime/free_list_test.cpp index c663210137..7612e9c7fa 100644 --- a/be/test/runtime/free_list_test.cpp +++ b/be/test/runtime/free_list_test.cpp @@ -18,16 +18,18 @@ #include "runtime/free_list.hpp" #include "runtime/mem_pool.h" +#include "runtime/mem_tracker.h" namespace palo { TEST(FreeListTest, Basic) { - MemPool pool; + MemTracker tracker; + MemPool pool(&tracker); FreeList list; int allocated_size; uint8_t* free_list_mem = list.allocate(FreeList::min_size(), &allocated_size); - EXPECT_EQ(NULL, free_list_mem); + EXPECT_EQ(nullptr, free_list_mem); EXPECT_EQ(allocated_size, 0); uint8_t* mem = pool.allocate(FreeList::min_size()); @@ -39,7 +41,7 @@ TEST(FreeListTest, Basic) { EXPECT_EQ(allocated_size, FreeList::min_size()); free_list_mem = list.allocate(FreeList::min_size(), &allocated_size); - EXPECT_EQ(NULL, free_list_mem); + EXPECT_EQ(nullptr, free_list_mem); EXPECT_EQ(allocated_size, 0); // Make 3 allocations and add them to the free list. @@ -48,9 +50,9 @@ TEST(FreeListTest, Basic) { // Attempt a 4th allocation from the free list and make sure // we get NULL. // Repeat with the same memory blocks. - uint8_t* free_list_mem1 = NULL; - uint8_t* free_list_mem2 = null; - uint8_t* free_list_mem3 = null; + uint8_t* free_list_mem1 = nullptr; + uint8_t* free_list_mem2 = nullptr; + uint8_t* free_list_mem3 = nullptr; mem = pool.allocate(FreeList::min_size()); list.add(mem, FreeList::min_size()); @@ -147,12 +149,14 @@ TEST(FreeListTest, Basic) { } int main(int argc, char** argv) { +#if 0 std::string conffile = std::string(getenv("PALO_HOME")) + "/conf/be.conf"; if (!palo::config::init(conffile.c_str(), false)) { fprintf(stderr, "error read config file. 
\n"); return -1; } init_glog("be-test"); +#endif ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/be/test/runtime/mem_pool_test.cpp b/be/test/runtime/mem_pool_test.cpp index c1676706b2..9ff1b244b7 100644 --- a/be/test/runtime/mem_pool_test.cpp +++ b/be/test/runtime/mem_pool_test.cpp @@ -156,23 +156,15 @@ TEST(MemPoolTest, ReturnPartial) { p.free_all(); } -// Utility class to call private functions on MemPool. -class MemPoolTest { - public: - static bool check_integrity(MemPool* pool, bool current_chunk_empty) { - return pool->check_integrity(current_chunk_empty); - } -}; - TEST(MemPoolTest, Limits) { MemTracker limit3(320); MemTracker limit1(160, "", &limit3); MemTracker limit2(240, "", &limit3); - MemPool* p1 = new MemPool(&limit1, 80); + MemPool* p1 = new MemPool(&limit1); EXPECT_FALSE(limit1.any_limit_exceeded()); - MemPool* p2 = new MemPool(&limit2, 80); + MemPool* p2 = new MemPool(&limit2); EXPECT_FALSE(limit2.any_limit_exceeded()); // p1 exceeds a non-shared limit @@ -213,18 +205,15 @@ TEST(MemPoolTest, Limits) { EXPECT_FALSE(limit2.limit_exceeded()); uint8_t* result = p2->try_allocate(160); DCHECK(result != NULL); - DCHECK(MemPoolTest::check_integrity(p2, false)); // Try To allocate another 160 bytes, this should fail. result = p2->try_allocate(160); DCHECK(result == NULL); - DCHECK(MemPoolTest::check_integrity(p2, false)); // Try To allocate 20 bytes, this should succeed. try_allocate() should leave the // pool in a functional state.. 
result = p2->try_allocate(20); DCHECK(result != NULL); - DCHECK(MemPoolTest::check_integrity(p2, false)); p2->free_all(); delete p2; @@ -299,8 +288,8 @@ int main(int argc, char** argv) { // return -1; // } palo::init_glog("be-test"); + ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); } diff --git a/be/test/runtime/sorter_test.cpp b/be/test/runtime/sorter_test.cpp index c46cf3cd87..1df94dc695 100644 --- a/be/test/runtime/sorter_test.cpp +++ b/be/test/runtime/sorter_test.cpp @@ -22,7 +22,7 @@ #include "runtime/row_batch.h" #include "runtime/tuple_row.h" #include "runtime/runtime_state.h" -#include "gen_cpp/ImpalaInternalService_types.h" +#include "gen_cpp/PaloInternalService_types.h" #include "gen_cpp/Types_types.h" #include "gen_cpp/Exprs_types.h" #include "exec/sort_exec_exprs.h" diff --git a/be/test/runtime/string_buffer_test.cpp b/be/test/runtime/string_buffer_test.cpp index d727d391f7..9e32adddaa 100644 --- a/be/test/runtime/string_buffer_test.cpp +++ b/be/test/runtime/string_buffer_test.cpp @@ -13,12 +13,13 @@ // specific language governing permissions and limitations // under the License. +#include "runtime/string_buffer.hpp" + #include #include #include "runtime/mem_pool.h" -#include "runtime/string_buffer.hpp" - +#include "runtime/mem_tracker.h" namespace palo { @@ -32,7 +33,8 @@ void validate_string(const std::string& std_str, const StringBuffer& str) { } TEST(StringBufferTest, Basic) { - MemPool pool; + MemTracker tracker; + MemPool pool(&tracker); StringBuffer str(&pool); std::string std_str; @@ -71,12 +73,14 @@ TEST(StringBufferTest, Basic) { } int main(int argc, char** argv) { +#if 0 std::string conffile = std::string(getenv("PALO_HOME")) + "/conf/be.conf"; if (!palo::config::init(conffile.c_str(), false)) { fprintf(stderr, "error read config file. 
\n"); return -1; } init_glog("be-test"); +#endif ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/be/test/runtime/string_value_test.cpp b/be/test/runtime/string_value_test.cpp index 4252380215..6055d1cbc4 100644 --- a/be/test/runtime/string_value_test.cpp +++ b/be/test/runtime/string_value_test.cpp @@ -13,12 +13,15 @@ // specific language governing permissions and limitations // under the License. +#include "runtime/string_value.hpp" + #include #include -#include "runtime/string_value.hpp" #include "util/cpu_info.h" +using std::string; + namespace palo { StringValue FromStdString(const string& str) { @@ -79,14 +82,17 @@ TEST(StringValueTest, TestCompare) { } int main(int argc, char** argv) { +#if 0 std::string conffile = std::string(getenv("PALO_HOME")) + "/conf/be.conf"; if (!palo::config::init(conffile.c_str(), false)) { fprintf(stderr, "error read config file. \n"); return -1; } init_glog("be-test"); - ::testing::InitGoogleTest(&argc, argv); palo::CpuInfo::Init(); +#endif + ::testing::InitGoogleTest(&argc, argv); + palo::CpuInfo::init(); return RUN_ALL_TESTS(); } diff --git a/be/test/runtime/tmp_file_mgr_test.cpp b/be/test/runtime/tmp_file_mgr_test.cpp index 3f23347822..ad1c5fdbae 100644 --- a/be/test/runtime/tmp_file_mgr_test.cpp +++ b/be/test/runtime/tmp_file_mgr_test.cpp @@ -38,13 +38,13 @@ namespace palo { class TmpFileMgrTest : public ::testing::Test { protected: virtual void SetUp() { - _metrics.reset(new MetricGroup("")); + _metrics.reset(new MetricRegistry("")); } virtual void TearDown() { _metrics.reset(); } - +#if 0 // Check that metric values are consistent with TmpFileMgr state. void check_metrics(TmpFileMgr* tmp_file_mgr) { vector active = tmp_file_mgr->active_tmp_devices(); @@ -60,8 +60,8 @@ protected: EXPECT_TRUE(active_set.find(tmp_dir_path) != active_set.end()); } } - - boost::scoped_ptr _metrics; +#endif + boost::scoped_ptr _metrics; }; // Regression test for IMPALA-2160. 
Verify that temporary file manager allocates blocks @@ -96,7 +96,7 @@ TEST_F(TmpFileMgrTest, TestFileAllocation) { status = file->remove(); EXPECT_TRUE(status.ok()); EXPECT_FALSE(boost::filesystem::exists(file->path())); - check_metrics(&tmp_file_mgr); + // check_metrics(&tmp_file_mgr); } // Test that we can do initialization with two directories on same device and // that validations prevents duplication of directories. @@ -120,7 +120,7 @@ TEST_F(TmpFileMgrTest, TestOneDirPerDevice) { // Check the prefix is the expected temporary directory. EXPECT_EQ(0, file->path().find(tmp_dirs[0])); FileSystemUtil::remove_paths(tmp_dirs); - check_metrics(&tmp_file_mgr); + // check_metrics(&tmp_file_mgr); } // Test that we can do custom initialization with two dirs on same device. @@ -147,7 +147,7 @@ TEST_F(TmpFileMgrTest, TestMultiDirsPerDevice) { EXPECT_EQ(0, file->path().find(tmp_dirs[i])); } FileSystemUtil::remove_paths(tmp_dirs); - check_metrics(&tmp_file_mgr); + // check_metrics(&tmp_file_mgr); } // Test that reporting a write error is possible but does not result in @@ -165,7 +165,7 @@ TEST_F(TmpFileMgrTest, TestReportError) { // Both directories should be used. vector devices = tmp_file_mgr.active_tmp_devices(); EXPECT_EQ(2, devices.size()); - check_metrics(&tmp_file_mgr); + // check_metrics(&tmp_file_mgr); // Inject an error on one device so that we can validate it is handled correctly. TUniqueId id; @@ -183,7 +183,7 @@ TEST_F(TmpFileMgrTest, TestReportError) { EXPECT_EQ(2, tmp_file_mgr.num_active_tmp_devices()); vector devices_after = tmp_file_mgr.active_tmp_devices(); EXPECT_EQ(2, devices_after.size()); - check_metrics(&tmp_file_mgr); + // check_metrics(&tmp_file_mgr); // Attempts to expand bad file should succeed. int64_t offset; @@ -197,7 +197,7 @@ TEST_F(TmpFileMgrTest, TestReportError) { // Attempts to allocate new files on bad device should succeed. 
EXPECT_TRUE(tmp_file_mgr.get_file(devices[bad_device], id, &bad_file).ok()); FileSystemUtil::remove_paths(tmp_dirs); - check_metrics(&tmp_file_mgr); + // check_metrics(&tmp_file_mgr); } TEST_F(TmpFileMgrTest, TestAllocateFails) { diff --git a/be/test/util/CMakeLists.txt b/be/test/util/CMakeLists.txt index 7d423e9ea4..6f9e54a10e 100644 --- a/be/test/util/CMakeLists.txt +++ b/be/test/util/CMakeLists.txt @@ -9,7 +9,7 @@ # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE_2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an @@ -21,20 +21,17 @@ # where to put generated libraries set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/test/util") -#ADD_BE_TEST(integer-array-test) -#ADD_BE_TEST(runtime_profile_test) -#ADD_BE_TEST(benchmark-test) -#ADD_BE_TEST(decompress-test) -#ADD_BE_TEST(metrics-test) -#ADD_BE_TEST(debug-util-test) -#ADD_BE_TEST(url-coding-test) -#ADD_BE_TEST(thrift-util-test) -#ADD_BE_TEST(bit-util-test) -#ADD_BE_TEST(rle-test) -##ADD_BE_TEST(perf-counters-test) +ADD_BE_TEST(bit_util_test) +ADD_BE_TEST(brpc_stub_cache_test) ADD_BE_TEST(path_trie_test) ADD_BE_TEST(count_down_latch_test) ADD_BE_TEST(lru_cache_util_test) ADD_BE_TEST(filesystem_util_test) ADD_BE_TEST(internal_queue_test) ADD_BE_TEST(cidr_test) +ADD_BE_TEST(new_metrics_test) +ADD_BE_TEST(palo_metrics_test) +ADD_BE_TEST(system_metrics_test) +ADD_BE_TEST(core_local_test) +ADD_BE_TEST(types_test) +ADD_BE_TEST(rpc_channel_test) diff --git a/be/test/util/bit_util_test.cpp b/be/test/util/bit_util_test.cpp index 497ee6ec26..9e6a8977ed 100644 --- a/be/test/util/bit_util_test.cpp +++ b/be/test/util/bit_util_test.cpp @@ -19,8 +19,10 @@ #include #include +#include "common/config.h" #include "util/bit_util.h" #include "util/cpu_info.h" +#include "util/logging.h" namespace palo { @@ 
-53,7 +55,7 @@ int main(int argc, char** argv) { fprintf(stderr, "error read config file. \n"); return -1; } - init_glog("be-test"); + palo::init_glog("be-test"); ::testing::InitGoogleTest(&argc, argv); palo::CpuInfo::init(); return RUN_ALL_TESTS(); diff --git a/be/test/util/brpc_stub_cache_test.cpp b/be/test/util/brpc_stub_cache_test.cpp new file mode 100644 index 0000000000..0268ce4101 --- /dev/null +++ b/be/test/util/brpc_stub_cache_test.cpp @@ -0,0 +1,59 @@ +// Copyright (c) 2018, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "util/brpc_stub_cache.h" + +#include + +namespace palo { + +class BrpcStubCacheTest : public testing::Test { +public: + BrpcStubCacheTest() { } + virtual ~BrpcStubCacheTest() { + } +}; + +TEST_F(BrpcStubCacheTest, normal) { + BrpcStubCache cache; + TNetworkAddress address; + address.hostname = "127.0.0.1"; + address.port = 123; + auto stub1 = cache.get_stub(address); + ASSERT_NE(nullptr, stub1); + address.port = 124; + auto stub2 = cache.get_stub(address); + ASSERT_NE(nullptr, stub2); + ASSERT_NE(stub1, stub2); + address.port = 123; + auto stub3 = cache.get_stub(address); + ASSERT_EQ(stub1, stub3); +} + +TEST_F(BrpcStubCacheTest, invalid) { + BrpcStubCache cache; + TNetworkAddress address; + address.hostname = "invalid.cm.invalid"; + address.port = 123; + auto stub1 = cache.get_stub(address); + ASSERT_EQ(nullptr, stub1); +} + +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/be/test/util/core_local_test.cpp b/be/test/util/core_local_test.cpp new file mode 100644 index 0000000000..702bd77c7d --- /dev/null +++ b/be/test/util/core_local_test.cpp @@ -0,0 +1,122 @@ +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "util/core_local.h" + +#include + +#include +#include + +#include "common/logging.h" +#include "util/stopwatch.hpp" +#include "time.h" + +namespace palo { + +// Fixture for testing class Decompressor +class CoreLocalTest : public ::testing::Test { +protected: + CoreLocalTest() { + } + ~CoreLocalTest() { + } +}; + +void updater(CoreLocalValue* value, int64_t* used_ns) { + sleep(1); + MonotonicStopWatch stopwatch; + stopwatch.start(); + for (int i = 0; i < 1000000L; ++i) { + __sync_fetch_and_add(value->access(), 1); + } + *used_ns = stopwatch.elapsed_time(); +} + +TEST_F(CoreLocalTest, CoreLocalValue) { + CoreLocalValue value; + std::vector used_ns; + used_ns.resize(8); + std::vector workers; + for (int i = 0; i < 8; ++i) { + workers.emplace_back(updater, &value, &used_ns[i]); + } + int64_t sum_ns = 0; + for (int i = 0; i < 8; ++i) { + workers[i].join(); + sum_ns += used_ns[i]; + } + int64_t sum = 0; + for (int i = 0; i < value.size(); ++i) { + sum += __sync_fetch_and_add(value.access_at_core(i), 0); + } + ASSERT_EQ(8 * 1000000L, sum); + LOG(INFO) << "time:" << sum_ns / sum << "ns/op"; +} + +TEST_F(CoreLocalTest, CoreDataAllocator) { + CoreDataAllocatorFactory factory; + auto allocator1 = factory.get_allocator(1, 8); + auto ptr = allocator1->get_or_create(0); + ASSERT_TRUE(ptr != nullptr); + { + auto ptr2 = allocator1->get_or_create(0); + ASSERT_TRUE(ptr == ptr2); + } + { + auto ptr2 = allocator1->get_or_create(4096); + ASSERT_TRUE(ptr2 != nullptr); + } + { + auto allocator2 = factory.get_allocator(2, 8); + ASSERT_TRUE(allocator2 != allocator1); + } +} + +TEST_F(CoreLocalTest, CoreLocalValueController) { + CoreLocalValueController controller; + auto id = controller.get_id(); + ASSERT_EQ(0, id); + controller.reclaim_id(id); + id = controller.get_id(); + ASSERT_EQ(0, id); + id = controller.get_id(); + ASSERT_EQ(1, id); +} + +TEST_F(CoreLocalTest, CoreLocalValueNormal) { + CoreLocalValue value; + for (int i = 0; i < value.size(); ++i) { + ASSERT_EQ(0, 
*value.access_at_core(i)); + *value.access_at_core(i) += 1; + } + for (int i = 0; i < value.size(); ++i) { + ASSERT_EQ(1, *value.access_at_core(i)); + } + for (int i = 0; i < 10000; ++i) { + *value.access() += 1; + } + int64_t sum = 0; + for (int i = 0; i < value.size(); ++i) { + sum += *value.access_at_core(i); + } + ASSERT_EQ(10000 + std::thread::hardware_concurrency(), sum); +} +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/be/test/util/count_down_latch_test.cpp b/be/test/util/count_down_latch_test.cpp index e864964394..fd29c9a666 100644 --- a/be/test/util/count_down_latch_test.cpp +++ b/be/test/util/count_down_latch_test.cpp @@ -14,7 +14,9 @@ // under the License. #include +#include "common/config.h" #include "util/count_down_latch.hpp" +#include "util/logging.h" #include @@ -87,6 +89,12 @@ TEST_F(CountDownLatchTest, Timeout) { } int main(int argc, char** argv) { + std::string conffile = std::string(getenv("PALO_HOME")) + "/conf/be.conf"; + if (!palo::config::init(conffile.c_str(), false)) { + fprintf(stderr, "error read config file. \n"); + return -1; + } + palo::init_glog("be-test"); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/be/test/util/debug_util_test.cpp b/be/test/util/debug_util_test.cpp deleted file mode 100644 index 51797d5c20..0000000000 --- a/be/test/util/debug_util_test.cpp +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include -#include - -#include -#include "util/debug-util.h" - -using namespace std; - -namespace impala { - -string RecursionStack(int level) { - if (level == 0) { - return GetStackTrace(); - } - - return RecursionStack(level - 1); -} - -TEST(DebugUtil, StackDump) { - cout << "Stack: " << endl << GetStackTrace() << endl; - cout << "Stack Recursion: " << endl << RecursionStack(5) << endl; -} - -TEST(DebugUtil, QueryIdParsing) { - TUniqueId id; - EXPECT_FALSE(ParseId("abcd", &id)); - EXPECT_FALSE(ParseId("abcdabcdabcdabcdabcdabcdabcdabcda", &id)); - EXPECT_FALSE(ParseId("zbcdabcdabcdabcd:abcdabcdabcdabcd", &id)); - EXPECT_FALSE(ParseId("~bcdabcdabcdabcd:abcdabcdabcdabcd", &id)); - EXPECT_FALSE(ParseId("abcdabcdabcdabcd:!bcdabcdabcdabcd", &id)); - - EXPECT_TRUE(ParseId("abcdabcdabcdabcd:abcdabcdabcdabcd", &id)); - EXPECT_EQ(id.hi, 0xabcdabcdabcdabcd); - EXPECT_EQ(id.lo, 0xabcdabcdabcdabcd); - - EXPECT_TRUE(ParseId("abcdabcdabcdabcd:1234abcdabcd5678", &id)); - EXPECT_EQ(id.hi, 0xabcdabcdabcdabcd); - EXPECT_EQ(id.lo, 0x1234abcdabcd5678); - - EXPECT_TRUE(ParseId("cdabcdabcdabcd:1234abcdabcd5678", &id)); - EXPECT_EQ(id.hi, 0xcdabcdabcdabcd); - EXPECT_EQ(id.lo, 0x1234abcdabcd5678); - - EXPECT_TRUE(ParseId("cdabcdabcdabcd:abcdabcd5678", &id)); - EXPECT_EQ(id.hi, 0xcdabcdabcdabcd); - EXPECT_EQ(id.lo, 0xabcdabcd5678); -} - -} - -int main(int argc, char** argv) { - std::string conffile = std::string(getenv("PALO_HOME")) + "/conf/be.conf"; - if (!palo::config::init(conffile.c_str(), false)) { - fprintf(stderr, "error read config file. 
\n"); - return -1; - } - init_glog("be-test"); - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} - diff --git a/be/test/util/decompress_test.cpp b/be/test/util/decompress_test.cpp index ae2d06b42f..4a90279fec 100644 --- a/be/test/util/decompress_test.cpp +++ b/be/test/util/decompress_test.cpp @@ -19,7 +19,7 @@ #include #include "util/decompress.h" #include "util/compress.h" -#include "gen-cpp/Descriptors_types.h" +#include "gen_cpp/Descriptors_types.h" using namespace std; using namespace boost; diff --git a/be/test/util/internal_queue_test.cpp b/be/test/util/internal_queue_test.cpp index d3c573e192..2d836e2892 100644 --- a/be/test/util/internal_queue_test.cpp +++ b/be/test/util/internal_queue_test.cpp @@ -314,11 +314,11 @@ TEST(InternalQueue, TestMultiProducerMultiConsumer) { } // end namespace palo int main(int argc, char** argv) { - // std::string conffile = std::string(getenv("PALO_HOME")) + "/conf/be.conf"; - // if (!palo::config::init(conffile.c_str(), false)) { - // fprintf(stderr, "error read config file. \n"); - // return -1; - // } + std::string conffile = std::string(getenv("PALO_HOME")) + "/conf/be.conf"; + if (!palo::config::init(conffile.c_str(), false)) { + fprintf(stderr, "error read config file. \n"); + return -1; + } palo::init_glog("be-test"); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/be/test/util/lru_cache_util_test.cpp b/be/test/util/lru_cache_util_test.cpp index 09604389ef..29c1f9de29 100644 --- a/be/test/util/lru_cache_util_test.cpp +++ b/be/test/util/lru_cache_util_test.cpp @@ -13,11 +13,13 @@ // specific language governing permissions and limitations // under the License. 
-#include "util/lru_cache.hpp" - #include #include +#include "common/config.h" +#include "util/logging.h" +#include "util/lru_cache.hpp" + namespace palo { class LruCacheTest : public testing::Test { @@ -89,6 +91,12 @@ TEST_F(LruCacheTest, OverSize) { } int main(int argc, char** argv) { + std::string conffile = std::string(getenv("PALO_HOME")) + "/conf/be.conf"; + if (!palo::config::init(conffile.c_str(), false)) { + fprintf(stderr, "error read config file. \n"); + return -1; + } + palo::init_glog("be-test"); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/be/test/util/metrics_test.cpp b/be/test/util/metrics_test.cpp deleted file mode 100644 index 32913f50bf..0000000000 --- a/be/test/util/metrics_test.cpp +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include "util/metrics.h" -#include "util/non-primitive-metrics.h" -#include -#include - -namespace palo { - -class MetricsTest : public testing::Test { -public: - Metrics* metrics() { - return _metrics.get(); - } - ~MetricsTest() { - } - MetricsTest() : _metrics(new Metrics()) { - _bool_metric = _metrics->create_and_register_primitive_metric("bool", false); - _int_metric = _metrics->create_and_register_primitive_metric("int", 0L); - _double_metric = _metrics->create_and_register_primitive_metric("double", - 1.23); - _string_metric = _metrics->create_and_register_primitive_metric("string", - string("hello world")); - - vector items; - items.push_back(1); - items.push_back(2); - items.push_back(3); - _list_metric = _metrics->register_metric(new ListMetric("list", items)); - set item_set; - item_set.insert(4); - item_set.insert(5); - item_set.insert(6); - _set_metric = _metrics->register_metric(new SetMetric("set", item_set)); - - set string_set; - string_set.insert("one"); - string_set.insert("two"); - _string_set_metric = _metrics->register_metric(new SetMetric("string_set", - string_set)); - } -private: - Metrics::BooleanMetric* _bool_metric; - Metrics::IntMetric* _int_metric; - Metrics::DoubleMetric* _double_metric; - Metrics::StringMetric* _string_metric; - - ListMetric* _list_metric; - SetMetric* _set_metric; - SetMetric* _string_set_metric; // For quote testing - - boost::scoped_ptr _metrics; -}; - -TEST_F(MetricsTest, IntMetrics) { - EXPECT_NE(metrics()->DebugString().find("int:0"), string::npos); - _int_metric->update(3); - EXPECT_NE(metrics()->DebugString().find("int:3"), string::npos); -} - -TEST_F(MetricsTest, DoubleMetrics) { - EXPECT_NE(metrics()->DebugString().find("double:1.23"), string::npos); - _double_metric->update(2.34); - EXPECT_NE(metrics()->DebugString().find("double:2.34"), string::npos); -} - -TEST_F(MetricsTest, StringMetrics) { - EXPECT_NE(metrics()->DebugString().find("string:hello world"), string::npos); - _string_metric->update("foo 
bar"); - EXPECT_NE(metrics()->DebugString().find("string:foo bar"), string::npos); -} - -TEST_F(MetricsTest, BooleanMetrics) { - EXPECT_NE(metrics()->DebugString().find("bool:0"), string::npos); - _bool_metric->update(true); - EXPECT_NE(metrics()->DebugString().find("bool:1"), string::npos); -} - -TEST_F(MetricsTest, ListMetrics) { - EXPECT_NE(metrics()->DebugString().find("list:[1, 2, 3]"), string::npos); - _list_metric->update(vector()); - EXPECT_NE(metrics()->DebugString().find("list:[]"), string::npos); -} - -TEST_F(MetricsTest, SetMetrics) { - EXPECT_NE(metrics()->DebugString().find("set:[4, 5, 6]"), string::npos); - _set_metric->Add(7); - _set_metric->Add(7); - _set_metric->Remove(4); - _set_metric->Remove(4); - EXPECT_NE(metrics()->DebugString().find("set:[5, 6, 7]"), string::npos); -} - -TEST_F(MetricsTest, TestAndSet) { - _int_metric->update(1); - // Expect update to fail - EXPECT_EQ(_int_metric->TestAndSet(5, 0), 1); - EXPECT_EQ(_int_metric->value(), 1); - - // Successful update - EXPECT_EQ(_int_metric->TestAndSet(5, 1), 1); - EXPECT_EQ(_int_metric->value(), 5); -} - -TEST_F(MetricsTest, increment) { - _int_metric->update(1); - EXPECT_EQ(_int_metric->increment(10), 11); - EXPECT_EQ(_int_metric->value(), 11); -} - -TEST_F(MetricsTest, JsonQuoting) { - // Strings should be quoted in Json output - EXPECT_NE(metrics()->debug_string_json().find("\"string\": \"hello world\""), - string::npos); - - // Other types should not be quoted - EXPECT_NE(metrics()->debug_string_json().find("\"bool\": 0"), string::npos); - - // Strings in sets should be quoted - EXPECT_NE(metrics()->debug_string_json().find("\"string_set\": [\"one\", \"two\"]"), - string::npos); - - // Other types in sets should not be quoted - EXPECT_NE(metrics()->debug_string_json().find("\"set\": [4, 5, 6]"), string::npos); -} -} - -int main(int argc, char** argv) { - std::string conffile = std::string(getenv("PALO_HOME")) + "/conf/be.conf"; - if (!palo::config::init(conffile.c_str(), false)) { - 
fprintf(stderr, "error read config file. \n"); - return -1; - } - init_glog("be-test"); - google::InitGoogleLogging(argv[0]); - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/be/test/util/mysql_row_buffer_test.cpp b/be/test/util/mysql_row_buffer_test.cpp deleted file mode 100644 index f619f5a217..0000000000 --- a/be/test/util/mysql_row_buffer_test.cpp +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include - -#include "util/mysql_row_buffer.h" - -using namespace std; - -namespace palo { - -class MysqlRowBufferTest : public testing::Test { -public: - MysqlRowBufferTest() { - } - -protected: - virtual void SetUp() { - } -}; - -TEST_F(MysqlRowBufferTest, tinyint) { - MysqlRowBuffer buffer; - - ASSERT_EQ(0, buffer.PushTinyInt(-111)); - ASSERT_EQ(4, *(int8_t*)buffer.buf()); - ASSERT_STREQ("-111", buffer.buf() + 1); - - buffer.Reset(); - ASSERT_EQ(0, buffer.PushTinyInt(100)); - ASSERT_EQ(3, *(int8_t*)buffer.buf()); - ASSERT_STREQ("100", buffer.buf() + 1); - - buffer.Reset(); - ASSERT_EQ(0, buffer.PushTinyInt(255)); - ASSERT_EQ(2, *(int8_t*)buffer.buf()); - ASSERT_STREQ("-1", buffer.buf() + 1); -} - -TEST_F(MysqlRowBufferTest, smallint) { - MysqlRowBuffer buffer; - - ASSERT_EQ(0, buffer.PushSmallInt(-10000)); - ASSERT_EQ(6, *(int8_t*)buffer.buf()); - ASSERT_STREQ("-10000", buffer.buf() + 1); - - buffer.Reset(); - ASSERT_EQ(0, buffer.PushSmallInt(32767)); - ASSERT_EQ(5, *(int8_t*)buffer.buf()); - ASSERT_STREQ("32767", buffer.buf() + 1); - - buffer.Reset(); - ASSERT_EQ(0, buffer.PushSmallInt(65535)); - ASSERT_EQ(2, *(int8_t*)buffer.buf()); - ASSERT_STREQ("-1", buffer.buf() + 1); -} - -TEST_F(MysqlRowBufferTest, int) { - MysqlRowBuffer buffer; - - ASSERT_EQ(0, buffer.PushInt(-10000)); - ASSERT_EQ(6, *(int8_t*)buffer.buf()); - ASSERT_STREQ("-10000", buffer.buf() + 1); - - buffer.Reset(); - ASSERT_EQ(0, buffer.PushInt(32767)); - ASSERT_EQ(5, *(int8_t*)buffer.buf()); - ASSERT_STREQ("32767", buffer.buf() + 1); - - buffer.Reset(); - ASSERT_EQ(0, buffer.PushInt(4294967295)); - ASSERT_EQ(2, *(int8_t*)buffer.buf()); - ASSERT_STREQ("-1", buffer.buf() + 1); -} -TEST_F(MysqlRowBufferTest, bigint) { - MysqlRowBuffer buffer; - - ASSERT_EQ(0, buffer.PushBigInt(-1000000000)); - ASSERT_EQ(11, *(int8_t*)buffer.buf()); - ASSERT_STREQ("-1000000000", buffer.buf() + 1); - - buffer.Reset(); - ASSERT_EQ(0, buffer.PushBigInt(1000032767)); - ASSERT_EQ(10, *(int8_t*)buffer.buf()); - 
ASSERT_STREQ("1000032767", buffer.buf() + 1); -} -TEST_F(MysqlRowBufferTest, float) { - MysqlRowBuffer buffer; - - ASSERT_EQ(0, buffer.PushFloat(-1.1)); - ASSERT_EQ(4, *(int8_t*)buffer.buf()); - ASSERT_STREQ("-1.1", buffer.buf() + 1); - - buffer.Reset(); - ASSERT_EQ(0, buffer.PushFloat(1000.12)); - ASSERT_EQ(7, *(int8_t*)buffer.buf()); - ASSERT_STREQ("1000.12", buffer.buf() + 1); -} -TEST_F(MysqlRowBufferTest, double) { - MysqlRowBuffer buffer; - - ASSERT_EQ(0, buffer.PushDouble(-1.1)); - ASSERT_EQ(4, *(int8_t*)buffer.buf()); - ASSERT_STREQ("-1.1", buffer.buf() + 1); - - buffer.Reset(); - ASSERT_EQ(0, buffer.PushDouble(1000.001)); - ASSERT_EQ(8, *(int8_t*)buffer.buf()); - ASSERT_STREQ("1000.001", buffer.buf() + 1); -} - -TEST_F(MysqlRowBufferTest, string) { - MysqlRowBuffer buffer; - - ASSERT_EQ(0, buffer.PushString("hello", 6)); - ASSERT_EQ(6, *(int8_t*)buffer.buf()); - ASSERT_STREQ("hello", buffer.buf() + 1); - ASSERT_NE(0, buffer.PushString(NULL, 6)); -} - -TEST_F(MysqlRowBufferTest, long_buffer) { - MysqlRowBuffer buffer; - - for (int i = 0; i < 5000; ++i) { - ASSERT_EQ(0, buffer.PushInt(10000)); - } - - ASSERT_EQ(30000, buffer.length()); -} - -} - -int main(int argc, char** argv) { - std::string conffile = std::string(getenv("PALO_HOME")) + "/conf/be.conf"; - if (!palo::config::init(conffile.c_str(), false)) { - fprintf(stderr, "error read config file. \n"); - return -1; - } - init_glog("be-test"); - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/be/test/util/new_metrics_test.cpp b/be/test/util/new_metrics_test.cpp new file mode 100644 index 0000000000..3da3fd343a --- /dev/null +++ b/be/test/util/new_metrics_test.cpp @@ -0,0 +1,295 @@ +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include +#include + +#include "common/config.h" +#include "util/logging.h" +#include "util/metrics.h" +#include "util/stopwatch.hpp" + +namespace palo { + +class MetricsTest : public testing::Test { +public: + MetricsTest() { } + virtual ~MetricsTest() { + } +}; + +TEST_F(MetricsTest, Counter) { + { + IntCounter counter; + ASSERT_EQ(0, counter.value()); + counter.increment(100); + ASSERT_EQ(100, counter.value()); + + ASSERT_STREQ("100", counter.to_string().c_str()); + } + { + DoubleCounter counter; + ASSERT_EQ(0.0, counter.value()); + counter.increment(1.23); + ASSERT_EQ(1.23, counter.value()); + + ASSERT_STREQ("1.23", counter.to_string().c_str()); + } +} + +void mt_updater(IntCounter* counter, std::atomic* used_time) { + sleep(1); + MonotonicStopWatch watch; + watch.start(); + for (int i = 0; i < 1000000L; ++i) { + counter->increment(1); + } + uint64_t elapsed = watch.elapsed_time(); + used_time->fetch_add(elapsed); +} + +TEST_F(MetricsTest, CounterPerf) { + IntCounter counter; + volatile int64_t sum = 0; + + { + MonotonicStopWatch watch; + watch.start(); + for (int i = 0; i < 100000000; ++i) { + counter.increment(1); + } + uint64_t elapsed = watch.elapsed_time(); + LOG(INFO) << "counter elapsed: " << elapsed + << "ns, ns/iter:" << elapsed / 100000000; + } + { + MonotonicStopWatch watch; + watch.start(); + for (int i = 0; i < 100000000; ++i) { + sum += 1; + } + uint64_t elapsed = watch.elapsed_time(); + LOG(INFO) << "value elapsed: " << elapsed + << "ns, ns/iter:" << elapsed / 100000000; + } + ASSERT_EQ(100000000, 
counter.value()); + ASSERT_EQ(100000000, sum); + { + IntCounter mt_counter; + std::vector updaters; + std::atomic used_time(0); + for (int i = 0; i < 8; ++i) { + updaters.emplace_back(&mt_updater, &mt_counter, &used_time); + } + for (int i = 0; i < 8; ++i) { + updaters[i].join(); + } + LOG(INFO) << "mt_counter elapsed: " << used_time.load() + << "ns, ns/iter:" << used_time.load() / (8 * 1000000L); + ASSERT_EQ(8 * 1000000L, mt_counter.value()); + } +} + +TEST_F(MetricsTest, Gauge) { + { + IntGauge gauge; + ASSERT_EQ(0, gauge.value()); + gauge.set_value(100); + ASSERT_EQ(100, gauge.value()); + + ASSERT_STREQ("100", gauge.to_string().c_str()); + } + { + DoubleGauge gauge; + ASSERT_EQ(0.0, gauge.value()); + gauge.set_value(1.23); + ASSERT_EQ(1.23, gauge.value()); + + ASSERT_STREQ("1.23", gauge.to_string().c_str()); + } +} + +TEST_F(MetricsTest, MetricLabel) { + std::string put("put"); + MetricLabel label("type", put); + + ASSERT_TRUE(label == MetricLabel("type", "put")); + ASSERT_TRUE(label != MetricLabel("type", "get")); + ASSERT_TRUE(label < MetricLabel("type", "quit")); + ASSERT_TRUE(label < MetricLabel("typee", "put")); + ASSERT_TRUE(label.compare(MetricLabel("type", "put")) == 0); + ASSERT_TRUE(label.compare(MetricLabel("typee", "put")) < 0); + + ASSERT_STREQ("type=put", label.to_string().c_str()); +} + +TEST_F(MetricsTest, MetricLabels) { + MetricLabels empty_labels; + + ASSERT_TRUE(empty_labels == MetricLabels()); + ASSERT_TRUE(empty_labels < MetricLabels().add("type", "put")); + ASSERT_TRUE(empty_labels.empty()); + + ASSERT_STREQ("", empty_labels.to_string().c_str()); + + MetricLabels labels; + labels.add("path", "/home").add("type", "put"); + + ASSERT_TRUE(labels == MetricLabels().add("path", "/home").add("type", "put")); + ASSERT_FALSE(labels == MetricLabels().add("path", "/home").add("type", "get")); + ASSERT_FALSE(labels == MetricLabels().add("path", "/home")); + ASSERT_TRUE(labels < MetricLabels().add("path", "/sports")); + ASSERT_TRUE(labels < 
MetricLabels().add("path", "/home").add("type", "put").add("xstatus", "404")); + ASSERT_FALSE(labels < MetricLabels().add("path", "/abc")); + ASSERT_FALSE(labels < MetricLabels().add("path", "/home").add("type", "put")); + + ASSERT_STREQ("path=/home,type=put", labels.to_string().c_str()); +} + +class TestMetricsVisitor : public MetricsVisitor { +public: + virtual ~TestMetricsVisitor() { } + void visit(const std::string& prefix, const std::string& name, + MetricCollector* collector) { + for (auto& it : collector->metrics()) { + Metric* metric = it.second; + auto& labels = it.first; + switch (metric->type()) { + case MetricType::COUNTER: { + bool has_prev = false; + if (!prefix.empty()) { + _ss << prefix; + has_prev = true; + } + if (!name.empty()) { + if (has_prev) { + _ss << "_"; + } + _ss << name; + } + if (!labels.empty()) { + if (has_prev) { + _ss << "_"; + } + _ss << labels.to_string(); + } + _ss << " " << ((SimpleMetric*)metric)->to_string() << std::endl; + break; + } + default: + break; + } + } + } + std::string to_string() { + return _ss.str(); + } +private: + std::stringstream _ss; +}; + +TEST_F(MetricsTest, MetricCollector) { + IntCounter puts; + puts.increment(101); + IntCounter gets; + gets.increment(201); + MetricCollector collector; + ASSERT_TRUE(collector.add_metic(MetricLabels().add("type", "put"), &puts)); + ASSERT_TRUE(collector.add_metic(MetricLabels().add("type", "get"), &gets)); + ASSERT_FALSE(collector.add_metic(MetricLabels().add("type", "get"), &gets)); + + { + // Can't add different type to one collector + IntGauge post; + ASSERT_FALSE(collector.add_metic(MetricLabels().add("type", "post"), &post)); + } + + { + TestMetricsVisitor visitor; + collector.collect("", "", &visitor); + ASSERT_STREQ("type=get 201\ntype=put 101\n", visitor.to_string().c_str()); + } + collector.remove_metric(&puts); + { + TestMetricsVisitor visitor; + collector.collect("", "", &visitor); + ASSERT_STREQ("type=get 201\n", visitor.to_string().c_str()); + } + // test 
get_metric + ASSERT_TRUE(collector.get_metric(MetricLabels()) == nullptr); + ASSERT_TRUE(collector.get_metric(MetricLabels().add("type" ,"get")) != nullptr); + std::vector metrics; + collector.get_metrics(&metrics); + ASSERT_EQ(1, metrics.size()); +} + +TEST_F(MetricsTest, MetricRegistry) { + MetricRegistry registry("test"); + IntCounter cpu_idle; + cpu_idle.increment(12); + ASSERT_TRUE(registry.register_metric("cpu_idle", &cpu_idle)); + // registry failed + IntCounter dummy; + ASSERT_FALSE(registry.register_metric("cpu_idle", &dummy)); + IntCounter memory_usage; + memory_usage.increment(24); + ASSERT_TRUE(registry.register_metric("memory_usage", &memory_usage)); + { + TestMetricsVisitor visitor; + registry.collect(&visitor); + ASSERT_STREQ("test_cpu_idle 12\ntest_memory_usage 24\n", visitor.to_string().c_str()); + } + registry.deregister_metric(&memory_usage); + { + TestMetricsVisitor visitor; + registry.collect(&visitor); + ASSERT_STREQ("test_cpu_idle 12\n", visitor.to_string().c_str()); + } + // test get_metric + ASSERT_TRUE(registry.get_metric("cpu_idle") != nullptr); + ASSERT_TRUE(registry.get_metric("memory_usage") == nullptr); +} + +TEST_F(MetricsTest, MetricRegistry2) { + MetricRegistry registry("test"); + IntCounter cpu_idle; + cpu_idle.increment(12); + ASSERT_TRUE(registry.register_metric("cpu_idle", &cpu_idle)); + + { + // memory_usage will deregister after this block + IntCounter memory_usage; + memory_usage.increment(24); + ASSERT_TRUE(registry.register_metric("memory_usage", &memory_usage)); + TestMetricsVisitor visitor; + registry.collect(&visitor); + ASSERT_STREQ("test_cpu_idle 12\ntest_memory_usage 24\n", visitor.to_string().c_str()); + } + + { + TestMetricsVisitor visitor; + registry.collect(&visitor); + ASSERT_STREQ("test_cpu_idle 12\n", visitor.to_string().c_str()); + } +} + +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/be/test/util/palo_metrics_test.cpp 
b/be/test/util/palo_metrics_test.cpp new file mode 100644 index 0000000000..da4bb40756 --- /dev/null +++ b/be/test/util/palo_metrics_test.cpp @@ -0,0 +1,278 @@ +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "common/config.h" +#include "util/logging.h" +#include "util/palo_metrics.h" + +namespace palo { + +class PaloMetricsTest : public testing::Test { +public: + PaloMetricsTest() { } + virtual ~PaloMetricsTest() { + } +}; + +class TestMetricsVisitor : public MetricsVisitor { +public: + virtual ~TestMetricsVisitor() { } + void visit(const std::string& prefix, const std::string& name, + MetricCollector* collector) { + for (auto& it : collector->metrics()) { + Metric* metric = it.second; + auto& labels = it.first; + switch (metric->type()) { + case MetricType::COUNTER: { + bool has_prev = false; + if (!prefix.empty()) { + _ss << prefix; + has_prev = true; + } + if (!name.empty()) { + if (has_prev) { + _ss << "_"; + } + _ss << name; + } + if (!labels.empty()) { + if (has_prev) { + _ss << "{"; + } + _ss << labels.to_string(); + if (has_prev) { + _ss << "}"; + } + } + _ss << " " << ((SimpleMetric*)metric)->to_string() << std::endl; + break; + } + default: + break; + } + } + } + std::string to_string() { + return _ss.str(); + } +private: + std::stringstream _ss; +}; + +TEST_F(PaloMetricsTest, Normal) { + TestMetricsVisitor visitor; + PaloMetrics::instance()->initialize("test"); 
+ auto metrics = PaloMetrics::metrics(); + metrics->collect(&visitor); + LOG(INFO) << "\n" << visitor.to_string(); + // check metric + { + PaloMetrics::fragment_requests_total.increment(12); + auto metric = metrics->get_metric("fragment_requests_total"); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("12", ((SimpleMetric*)metric)->to_string().c_str()); + } + { + PaloMetrics::fragment_request_duration_us.increment(101); + auto metric = metrics->get_metric("fragment_request_duration_us"); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("101", ((SimpleMetric*)metric)->to_string().c_str()); + } + { + PaloMetrics::http_requests_total.increment(102); + auto metric = metrics->get_metric("http_requests_total"); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("102", ((SimpleMetric*)metric)->to_string().c_str()); + } + { + PaloMetrics::http_request_duration_us.increment(103); + auto metric = metrics->get_metric("http_request_duration_us"); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("103", ((SimpleMetric*)metric)->to_string().c_str()); + } + { + PaloMetrics::http_request_send_bytes.increment(104); + auto metric = metrics->get_metric("http_request_send_bytes"); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("104", ((SimpleMetric*)metric)->to_string().c_str()); + } + { + PaloMetrics::query_scan_bytes.increment(104); + auto metric = metrics->get_metric("query_scan_bytes"); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("104", ((SimpleMetric*)metric)->to_string().c_str()); + } + { + PaloMetrics::query_scan_rows.increment(105); + auto metric = metrics->get_metric("query_scan_rows"); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("105", ((SimpleMetric*)metric)->to_string().c_str()); + } + { + PaloMetrics::ranges_processed_total.increment(13); + auto metric = metrics->get_metric("ranges_processed_total"); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("13", ((SimpleMetric*)metric)->to_string().c_str()); + } + { + 
PaloMetrics::push_requests_success_total.increment(106); + auto metric = metrics->get_metric("push_requests_total", + MetricLabels().add("status", "SUCCESS")); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("106", ((SimpleMetric*)metric)->to_string().c_str()); + } + { + PaloMetrics::push_requests_fail_total.increment(107); + auto metric = metrics->get_metric("push_requests_total", + MetricLabels().add("status", "FAIL")); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("107", ((SimpleMetric*)metric)->to_string().c_str()); + } + { + PaloMetrics::push_request_duration_us.increment(108); + auto metric = metrics->get_metric("push_request_duration_us"); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("108", ((SimpleMetric*)metric)->to_string().c_str()); + } + { + PaloMetrics::push_request_write_bytes.increment(109); + auto metric = metrics->get_metric("push_request_write_bytes"); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("109", ((SimpleMetric*)metric)->to_string().c_str()); + } + { + PaloMetrics::push_request_write_rows.increment(110); + auto metric = metrics->get_metric("push_request_write_rows"); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("110", ((SimpleMetric*)metric)->to_string().c_str()); + } + // engine request + { + PaloMetrics::create_tablet_requests_total.increment(15); + auto metric = metrics->get_metric("engine_requests_total", + MetricLabels().add("type", "create_tablet")); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("15", ((SimpleMetric*)metric)->to_string().c_str()); + } + { + PaloMetrics::drop_tablet_requests_total.increment(16); + auto metric = metrics->get_metric("engine_requests_total", + MetricLabels().add("type", "drop_tablet")); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("16", ((SimpleMetric*)metric)->to_string().c_str()); + } + { + PaloMetrics::report_all_tablets_requests_total.increment(17); + auto metric = metrics->get_metric("engine_requests_total", + MetricLabels().add("type", "report_all_tablets")); + 
ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("17", ((SimpleMetric*)metric)->to_string().c_str()); + } + { + PaloMetrics::report_tablet_requests_total.increment(18); + auto metric = metrics->get_metric("engine_requests_total", + MetricLabels().add("type", "report_tablet")); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("18", ((SimpleMetric*)metric)->to_string().c_str()); + } + { + PaloMetrics::schema_change_requests_total.increment(19); + auto metric = metrics->get_metric("engine_requests_total", + MetricLabels().add("type", "schema_change")); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("19", ((SimpleMetric*)metric)->to_string().c_str()); + } + { + PaloMetrics::create_rollup_requests_total.increment(20); + auto metric = metrics->get_metric("engine_requests_total", + MetricLabels().add("type", "create_rollup")); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("20", ((SimpleMetric*)metric)->to_string().c_str()); + } + { + PaloMetrics::storage_migrate_requests_total.increment(21); + auto metric = metrics->get_metric("engine_requests_total", + MetricLabels().add("type", "storage_migrate")); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("21", ((SimpleMetric*)metric)->to_string().c_str()); + } + { + PaloMetrics::delete_requests_total.increment(22); + auto metric = metrics->get_metric("engine_requests_total", + MetricLabels().add("type", "delete")); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("22", ((SimpleMetric*)metric)->to_string().c_str()); + } + { + PaloMetrics::cancel_delete_requests_total.increment(23); + auto metric = metrics->get_metric("engine_requests_total", + MetricLabels().add("type", "cancel_delete")); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("23", ((SimpleMetric*)metric)->to_string().c_str()); + } + // comapction + { + PaloMetrics::base_compaction_deltas_total.increment(30); + auto metric = metrics->get_metric("compaction_deltas_total", + MetricLabels().add("type", "base")); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("30", 
((SimpleMetric*)metric)->to_string().c_str()); + } + { + PaloMetrics::cumulative_compaction_deltas_total.increment(31); + auto metric = metrics->get_metric("compaction_deltas_total", + MetricLabels().add("type", "cumulative")); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("31", ((SimpleMetric*)metric)->to_string().c_str()); + } + { + PaloMetrics::base_compaction_bytes_total.increment(32); + auto metric = metrics->get_metric("compaction_bytes_total", + MetricLabels().add("type", "base")); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("32", ((SimpleMetric*)metric)->to_string().c_str()); + } + { + PaloMetrics::cumulative_compaction_bytes_total.increment(33); + auto metric = metrics->get_metric("compaction_bytes_total", + MetricLabels().add("type", "cumulative")); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("33", ((SimpleMetric*)metric)->to_string().c_str()); + } + // Gauge + { + PaloMetrics::memory_pool_bytes_total.increment(40); + auto metric = metrics->get_metric("memory_pool_bytes_total"); + ASSERT_TRUE(metric != nullptr); + ASSERT_STREQ("40", ((SimpleMetric*)metric)->to_string().c_str()); + } +} + +} + +int main(int argc, char** argv) { +#if 0 + std::string conffile = std::string(getenv("PALO_HOME")) + "/conf/be.conf"; + if (!palo::config::init(conffile.c_str(), false)) { + fprintf(stderr, "error read config file. \n"); + return -1; + } + palo::init_glog("be-test"); +#endif + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/be/test/util/path_trie_test.cpp b/be/test/util/path_trie_test.cpp index 51add262e2..fe11606ad5 100644 --- a/be/test/util/path_trie_test.cpp +++ b/be/test/util/path_trie_test.cpp @@ -13,10 +13,12 @@ // specific language governing permissions and limitations // under the License. 
-#include "util/path_trie.hpp" - #include +#include "common/config.h" +#include "util/logging.h" +#include "util/path_trie.hpp" + namespace palo { class PathTrieTest : public testing::Test { @@ -166,6 +168,12 @@ TEST_F(PathTrieTest, EmptyTest) { } int main(int argc, char* argv[]) { + std::string conffile = std::string(getenv("PALO_HOME")) + "/conf/be.conf"; + if (!palo::config::init(conffile.c_str(), false)) { + fprintf(stderr, "error read config file. \n"); + return -1; + } + palo::init_glog("be-test"); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/be/test/util/perf_counters_test.cpp b/be/test/util/perf_counters_test.cpp index 3c79cd519d..5881d3a840 100644 --- a/be/test/util/perf_counters_test.cpp +++ b/be/test/util/perf_counters_test.cpp @@ -17,10 +17,10 @@ #include #include #include -#include "util/cpu-info.h" -#include "util/disk-info.h" -#include "util/mem-info.h" -#include "util/perf-counters.h" +#include "util/cpu_info.h" +#include "util/disk_info.h" +#include "util/mem_info.h" +#include "util/perf_counters.h" using namespace std; diff --git a/be/test/util/rpc_channel_test.cpp b/be/test/util/rpc_channel_test.cpp new file mode 100644 index 0000000000..8723f7da5a --- /dev/null +++ b/be/test/util/rpc_channel_test.cpp @@ -0,0 +1,185 @@ +// Copyright (c) 2018, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "util/rpc_channel.h" + +#include + +#include "rpc/connection_handler_factory.h" +#include "rpc/connection_manager.h" +#include "rpc/dispatch_handler.h" +#include "rpc/error.h" +#include "rpc/event.h" +#include "rpc/reactor_factory.h" + +namespace palo { + +class TestFactory : public ConnectionHandlerFactory { +public: + TestFactory(DispatchHandlerPtr handler) : _handler(handler) { } + virtual ~TestFactory() { } + void get_instance(DispatchHandlerPtr &dhp) override { + dhp = _handler; + } +private: + DispatchHandlerPtr _handler; +}; + +class TestDispacher : public DispatchHandler { +public: + TestDispacher(Comm* comm) : _comm(comm) { } + virtual ~TestDispacher() { } + + void handle(EventPtr& event) override { + if (event->type == Event::CONNECTION_ESTABLISHED) { + LOG(INFO) << "Connection Established."; + } else if (event->type == Event::DISCONNECT) { + if (event->error != 0) { + LOG(INFO) << "Disconnect : " << error::get_text(event->error); + } else { + LOG(INFO) << "Disconnect"; + } + } else if (event->type == Event::ERROR) { + LOG(ERROR) << "Error: " << error::get_text(event->error); + } else if (event->type == Event::MESSAGE) { + const uint8_t *buf_ptr = (const uint8_t*)event->payload; + if (buf_ptr[0] == 123) { + // ignore this packet + return; + } + CommHeader header; + header.initialize_from_request_header(event->header); + CommBufPtr response(new CommBuf(header, event->payload_len)); + response->append_bytes(event->payload, event->payload_len); + int error = _comm->send_response(event->addr, response); + if (error != error::OK) { + LOG(ERROR) << "Comm::send_response returned" << error::get_text(error); + } + } + } +private: + Comm* _comm; +}; + +class RpcChannelTest : public testing::Test { +public: + RpcChannelTest() { } + virtual ~RpcChannelTest() { + } + static void SetUpTestCase() { + ReactorFactory::initialize(1); + _comm.reset(new Comm("127.0.0.1")); + _conn_mgr = std::make_shared(_comm.get()); + DispatchHandlerPtr dhp = 
std::make_shared(_comm.get()); + ConnectionHandlerFactoryPtr factory = std::make_shared(dhp); + + struct sockaddr_in addr; + InetAddr::initialize(&addr, "127.0.0.1", 25437); + _comm->listen(addr, factory, dhp); + } + void SetUp() override { } + +private: + static std::unique_ptr _comm; + static ConnectionManagerPtr _conn_mgr; +}; + +std::unique_ptr RpcChannelTest::_comm; +ConnectionManagerPtr RpcChannelTest::_conn_mgr; + +TEST_F(RpcChannelTest, normal) { + RpcChannelPtr channel = std::make_shared(_comm.get(), _conn_mgr, 0); + auto st = channel->init("127.0.0.1", 25437, 500, 1000); + ASSERT_TRUE(st.ok()); + + uint8_t buf[16]; + for (int i = 0; i < 16; ++i) { + buf[i] = i; + } + st = channel->send_message(buf, 16); + ASSERT_TRUE(st.ok()); + const uint8_t* rep_buf = nullptr; + uint32_t rep_size = 0; + st = channel->get_response(&rep_buf, &rep_size); + ASSERT_TRUE(st.ok()); + ASSERT_EQ(16, rep_size); + for (int i = 0; i < rep_size; ++i) { + ASSERT_EQ(i, rep_buf[i]); + } +} + +TEST_F(RpcChannelTest, send_fail) { + RpcChannelPtr channel = std::make_shared(_comm.get(), _conn_mgr, 0); + auto st = channel->init("127.0.0.1", 25437, 100, 100); + ASSERT_TRUE(st.ok()); + + uint8_t buf[16]; + memset(buf, 0, 16); + // make reponse ignore this packet + buf[0] = 123; + st = channel->send_message(buf, 16); + ASSERT_TRUE(st.ok()); + const uint8_t* rep_buf = nullptr; + uint32_t rep_size = 0; + st = channel->get_response(&rep_buf, &rep_size); + ASSERT_FALSE(st.ok()); +} + +TEST_F(RpcChannelTest, disconnect) { + RpcChannelPtr channel = std::make_shared(_comm.get(), _conn_mgr, 0); + auto st = channel->init("127.0.0.1", 25437, 100, 100); + ASSERT_TRUE(st.ok()); + + EventPtr event = std::make_shared(Event::DISCONNECT); + channel->handle(event); + uint8_t buf[16]; + for (int i = 0; i < 16; ++i) { + buf[i] = i; + } + st = channel->send_message(buf, 16); + ASSERT_TRUE(st.ok()); + const uint8_t* rep_buf = nullptr; + uint32_t rep_size = 0; + st = channel->get_response(&rep_buf, &rep_size); + 
ASSERT_TRUE(st.ok()); + ASSERT_EQ(16, rep_size); + for (int i = 0; i < rep_size; ++i) { + ASSERT_EQ(i, rep_buf[i]); + } +} + +TEST_F(RpcChannelTest, unknown_port) { + RpcChannelPtr channel = std::make_shared(_comm.get(), _conn_mgr, 0); + auto st = channel->init("127.0.0.1", 25438, 100, 100); + ASSERT_TRUE(st.ok()); + + EventPtr event = std::make_shared(Event::DISCONNECT); + channel->handle(event); + uint8_t buf[16]; + memset(buf, 0, 16); + st = channel->send_message(buf, 16); + ASSERT_TRUE(st.ok()); + const uint8_t* rep_buf = nullptr; + uint32_t rep_size = 0; + st = channel->get_response(&rep_buf, &rep_size); + ASSERT_FALSE(st.ok()); +} + +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/be/test/util/runtime_profile_test.cpp b/be/test/util/runtime_profile_test.cpp index fe015ccedc..a10ac76be3 100644 --- a/be/test/util/runtime_profile_test.cpp +++ b/be/test/util/runtime_profile_test.cpp @@ -18,9 +18,9 @@ #include #include #include -#include "common/object-pool.h" -#include "util/runtime-profile.h" -#include "util/cpu-info.h" +#include "common/object_pool.h" +#include "util/runtime_profile.h" +#include "util/cpu_info.h" using namespace std; using namespace boost; diff --git a/be/test/util/system_metrics_test.cpp b/be/test/util/system_metrics_test.cpp new file mode 100644 index 0000000000..5bc43e34d1 --- /dev/null +++ b/be/test/util/system_metrics_test.cpp @@ -0,0 +1,281 @@ +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "util/system_metrics.h" + +#include +#include + +#include "common/config.h" +#include "util/logging.h" +#include "util/metrics.h" +#include "util/stopwatch.hpp" + +namespace palo { + +class SystemMetricsTest : public testing::Test { +public: + SystemMetricsTest() { } + virtual ~SystemMetricsTest() { + } +}; + +class TestMetricsVisitor : public MetricsVisitor { +public: + virtual ~TestMetricsVisitor() { } + void visit(const std::string& prefix, const std::string& name, + MetricCollector* collector) { + for (auto& it : collector->metrics()) { + Metric* metric = it.second; + auto& labels = it.first; + switch (metric->type()) { + case MetricType::GAUGE: + case MetricType::COUNTER: { + bool has_prev = false; + if (!prefix.empty()) { + _ss << prefix; + has_prev = true; + } + if (!name.empty()) { + if (has_prev) { + _ss << "_"; + } + _ss << name; + } + if (!labels.empty()) { + if (has_prev) { + _ss << "{"; + } + _ss << labels.to_string(); + if (has_prev) { + _ss << "}"; + } + } + _ss << " " << ((SimpleMetric*)metric)->to_string() << std::endl; + break; + } + default: + break; + } + } + } + std::string to_string() { + return _ss.str(); + } +private: + std::stringstream _ss; +}; + +extern const char* k_ut_stat_path; +extern const char* k_ut_diskstats_path; +extern const char* k_ut_net_dev_path; + +TEST_F(SystemMetricsTest, normal) { + MetricRegistry registry("test"); + { + char buf[1024]; + readlink("/proc/self/exe", buf, 1023); + char* dir_path = dirname(buf); + std::string stat_path(dir_path); + stat_path += "/test_data/stat_normal"; + LOG(INFO) << stat_path; + k_ut_stat_path = stat_path.c_str(); + std::string diskstats_path(dir_path); + diskstats_path += "/test_data/diskstats_normal"; + k_ut_diskstats_path = diskstats_path.c_str(); + std::string net_dev_path(dir_path); + net_dev_path += "/test_data/net_dev_normal"; + k_ut_net_dev_path = 
net_dev_path.c_str(); + + std::set disk_devices; + disk_devices.emplace("sda"); + std::vector network_interfaces; + network_interfaces.emplace_back("xgbe0"); + SystemMetrics metrics; + metrics.install(®istry, disk_devices, network_interfaces); + + TestMetricsVisitor visitor; + registry.collect(&visitor); + LOG(INFO) << "\n" << visitor.to_string(); + + // cpu + SimpleMetric* cpu_user = (SimpleMetric*)registry.get_metric( + "cpu", MetricLabels().add("mode", "user")); + ASSERT_TRUE(cpu_user != nullptr); + ASSERT_STREQ("57199151", cpu_user->to_string().c_str()); + SimpleMetric* cpu_nice = (SimpleMetric*)registry.get_metric( + "cpu", MetricLabels().add("mode", "nice")); + ASSERT_TRUE(cpu_nice != nullptr); + ASSERT_STREQ("2616310", cpu_nice->to_string().c_str()); + SimpleMetric* cpu_system = (SimpleMetric*)registry.get_metric( + "cpu", MetricLabels().add("mode", "system")); + ASSERT_TRUE(cpu_system != nullptr); + ASSERT_STREQ("10600935", cpu_system->to_string().c_str()); + SimpleMetric* cpu_idle = (SimpleMetric*)registry.get_metric( + "cpu", MetricLabels().add("mode", "idle")); + ASSERT_TRUE(cpu_idle != nullptr); + ASSERT_STREQ("1517505423", cpu_idle->to_string().c_str()); + SimpleMetric* cpu_iowait = (SimpleMetric*)registry.get_metric( + "cpu", MetricLabels().add("mode", "iowait")); + ASSERT_TRUE(cpu_iowait != nullptr); + ASSERT_STREQ("2137148", cpu_iowait->to_string().c_str()); + SimpleMetric* cpu_irq = (SimpleMetric*)registry.get_metric( + "cpu", MetricLabels().add("mode", "irq")); + ASSERT_TRUE(cpu_irq != nullptr); + ASSERT_STREQ("0", cpu_irq->to_string().c_str()); + SimpleMetric* cpu_softirq = (SimpleMetric*)registry.get_metric( + "cpu", MetricLabels().add("mode", "soft_irq")); + ASSERT_TRUE(cpu_softirq != nullptr); + ASSERT_STREQ("108277", cpu_softirq->to_string().c_str()); + SimpleMetric* cpu_steal = (SimpleMetric*)registry.get_metric( + "cpu", MetricLabels().add("mode", "steal")); + ASSERT_TRUE(cpu_steal != nullptr); + ASSERT_STREQ("0", 
cpu_steal->to_string().c_str()); + SimpleMetric* cpu_guest = (SimpleMetric*)registry.get_metric( + "cpu", MetricLabels().add("mode", "guest")); + ASSERT_TRUE(cpu_guest != nullptr); + ASSERT_STREQ("0", cpu_guest->to_string().c_str()); + // memroy + SimpleMetric* memory_allocated_bytes = (SimpleMetric*)registry.get_metric( + "memory_allocated_bytes"); + ASSERT_TRUE(memory_allocated_bytes != nullptr); + // network + SimpleMetric* receive_bytes = (SimpleMetric*)registry.get_metric( + "network_receive_bytes", MetricLabels().add("device", "xgbe0")); + ASSERT_TRUE(receive_bytes != nullptr); + ASSERT_STREQ("52567436039", receive_bytes->to_string().c_str()); + SimpleMetric* receive_packets = (SimpleMetric*)registry.get_metric( + "network_receive_packets", MetricLabels().add("device", "xgbe0")); + ASSERT_TRUE(receive_packets != nullptr); + ASSERT_STREQ("65066152", receive_packets->to_string().c_str()); + SimpleMetric* send_bytes = (SimpleMetric*)registry.get_metric( + "network_send_bytes", MetricLabels().add("device", "xgbe0")); + ASSERT_TRUE(send_bytes != nullptr); + ASSERT_STREQ("45480856156", send_bytes->to_string().c_str()); + SimpleMetric* send_packets = (SimpleMetric*)registry.get_metric( + "network_send_packets", MetricLabels().add("device", "xgbe0")); + ASSERT_TRUE(send_packets != nullptr); + ASSERT_STREQ("88277614", send_packets->to_string().c_str()); + // disk + SimpleMetric* bytes_read = (SimpleMetric*)registry.get_metric( + "disk_bytes_read", MetricLabels().add("device", "sda")); + ASSERT_TRUE(bytes_read != nullptr); + ASSERT_STREQ("20142745600", bytes_read->to_string().c_str()); + SimpleMetric* reads_completed = (SimpleMetric*)registry.get_metric( + "disk_reads_completed", MetricLabels().add("device", "sda")); + ASSERT_TRUE(reads_completed != nullptr); + ASSERT_STREQ("759548", reads_completed->to_string().c_str()); + SimpleMetric* read_time_ms = (SimpleMetric*)registry.get_metric( + "disk_read_time_ms", MetricLabels().add("device", "sda")); + 
ASSERT_TRUE(read_time_ms != nullptr); + ASSERT_STREQ("4308146", read_time_ms->to_string().c_str()); + + SimpleMetric* bytes_written = (SimpleMetric*)registry.get_metric( + "disk_bytes_written", MetricLabels().add("device", "sda")); + ASSERT_TRUE(bytes_written != nullptr); + ASSERT_STREQ("1624753500160", bytes_written->to_string().c_str()); + SimpleMetric* writes_completed = (SimpleMetric*)registry.get_metric( + "disk_writes_completed", MetricLabels().add("device", "sda")); + ASSERT_TRUE(writes_completed != nullptr); + ASSERT_STREQ("18282936", writes_completed->to_string().c_str()); + SimpleMetric* write_time_ms = (SimpleMetric*)registry.get_metric( + "disk_write_time_ms", MetricLabels().add("device", "sda")); + ASSERT_TRUE(write_time_ms != nullptr); + ASSERT_STREQ("1907755230", write_time_ms->to_string().c_str()); + SimpleMetric* io_time_ms = (SimpleMetric*)registry.get_metric( + "disk_io_time_ms", MetricLabels().add("device", "sda")); + ASSERT_TRUE(io_time_ms != nullptr); + ASSERT_STREQ("19003350", io_time_ms->to_string().c_str()); + SimpleMetric* io_time_weigthed = (SimpleMetric*)registry.get_metric( + "disk_io_time_weigthed", MetricLabels().add("device", "sda")); + ASSERT_TRUE(write_time_ms != nullptr); + ASSERT_STREQ("1912122964", io_time_weigthed->to_string().c_str()); + } + { + TestMetricsVisitor visitor; + registry.collect(&visitor); + ASSERT_TRUE(visitor.to_string().empty()); + + Metric* cpu_idle = registry.get_metric("cpu", MetricLabels().add("mode", "idle")); + ASSERT_TRUE(cpu_idle == nullptr); + Metric* cpu_user = registry.get_metric("cpu", MetricLabels().add("mode", "user")); + ASSERT_TRUE(cpu_user == nullptr); + Metric* memory_allocated_bytes = registry.get_metric("memory_allocated_bytes"); + ASSERT_TRUE(memory_allocated_bytes == nullptr); + } +} + +TEST_F(SystemMetricsTest, no_proc_file) { + MetricRegistry registry("test"); + { + char buf[1024]; + readlink("/proc/self/exe", buf, 1023); + char* dir_path = dirname(buf); + std::string 
stat_path(dir_path); + stat_path += "/test_data/no_stat_normal"; + LOG(INFO) << stat_path; + k_ut_stat_path = stat_path.c_str(); + std::string diskstats_path(dir_path); + diskstats_path += "/test_data/no_diskstats_normal"; + k_ut_diskstats_path = diskstats_path.c_str(); + std::string net_dev_path(dir_path); + net_dev_path += "/test_data/no_net_dev_normal"; + k_ut_net_dev_path = net_dev_path.c_str(); + + std::set disk_devices; + disk_devices.emplace("sda"); + std::vector network_interfaces; + network_interfaces.emplace_back("xgbe0"); + SystemMetrics metrics; + metrics.install(®istry, disk_devices, network_interfaces); + + TestMetricsVisitor visitor; + registry.collect(&visitor); + LOG(INFO) << "\n" << visitor.to_string(); + + // cpu + SimpleMetric* cpu_user = (SimpleMetric*)registry.get_metric( + "cpu", MetricLabels().add("mode", "user")); + ASSERT_TRUE(cpu_user != nullptr); + ASSERT_STREQ("0", cpu_user->to_string().c_str()); + // memroy + SimpleMetric* memory_allocated_bytes = (SimpleMetric*)registry.get_metric( + "memory_allocated_bytes"); + ASSERT_TRUE(memory_allocated_bytes != nullptr); + // network + SimpleMetric* receive_bytes = (SimpleMetric*)registry.get_metric( + "network_receive_bytes", MetricLabels().add("device", "xgbe0")); + ASSERT_TRUE(receive_bytes != nullptr); + ASSERT_STREQ("0", receive_bytes->to_string().c_str()); + // disk + SimpleMetric* bytes_read = (SimpleMetric*)registry.get_metric( + "disk_bytes_read", MetricLabels().add("device", "sda")); + ASSERT_TRUE(bytes_read != nullptr); + ASSERT_STREQ("0", bytes_read->to_string().c_str()); + } +} + +} + +int main(int argc, char** argv) { + std::string conffile = std::string(getenv("PALO_HOME")) + "/conf/be.conf"; + if (!palo::config::init(conffile.c_str(), false)) { + fprintf(stderr, "error read config file. 
\n"); + return -1; + } + palo::init_glog("be-test"); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/be/test/util/test_data/diskstats_normal b/be/test/util/test_data/diskstats_normal new file mode 100644 index 0000000000..a9482272b6 --- /dev/null +++ b/be/test/util/test_data/diskstats_normal @@ -0,0 +1,34 @@ + 1 0 ram0 0 0 0 0 0 0 0 0 0 0 0 + 1 1 ram1 0 0 0 0 0 0 0 0 0 0 0 + 1 2 ram2 0 0 0 0 0 0 0 0 0 0 0 + 1 3 ram3 0 0 0 0 0 0 0 0 0 0 0 + 1 4 ram4 0 0 0 0 0 0 0 0 0 0 0 + 1 5 ram5 0 0 0 0 0 0 0 0 0 0 0 + 1 6 ram6 0 0 0 0 0 0 0 0 0 0 0 + 1 7 ram7 0 0 0 0 0 0 0 0 0 0 0 + 1 8 ram8 0 0 0 0 0 0 0 0 0 0 0 + 1 9 ram9 0 0 0 0 0 0 0 0 0 0 0 + 1 10 ram10 0 0 0 0 0 0 0 0 0 0 0 + 1 11 ram11 0 0 0 0 0 0 0 0 0 0 0 + 1 12 ram12 0 0 0 0 0 0 0 0 0 0 0 + 1 13 ram13 0 0 0 0 0 0 0 0 0 0 0 + 1 14 ram14 0 0 0 0 0 0 0 0 0 0 0 + 1 15 ram15 0 0 0 0 0 0 0 0 0 0 0 + 8 112 sdh 1402 587 9740 8662 56 78 258 5 0 4428 8666 + 8 113 sdh1 1269 586 8668 7209 42 78 258 5 0 3102 7213 + 8 96 sdg 1389 600 9740 5562 56 78 258 28 0 4438 5590 + 8 97 sdg1 1256 599 8668 3762 42 78 258 26 0 2667 3788 + 8 48 sdd 1279 223 12004 6708 36 171 1544 27 0 6719 6733 + 8 49 sdd1 587 211 6372 620 22 171 1544 26 0 631 645 + 8 32 sdc 556988 11622 94086908 3935795 1819721 33250478 280645224 94064440 0 2495078 98006563 + 8 33 sdc1 556296 11610 94081276 3928740 1818030 33250478 280645224 94064268 0 2488359 97999336 + 8 16 sdb 1279 223 12004 6832 36 171 1544 24 0 6843 6855 + 8 17 sdb1 587 211 6372 880 22 171 1544 24 0 892 904 + 8 80 sdf 1276 223 11980 5617 26 168 1440 8 0 5614 5624 + 8 81 sdf1 584 211 6348 1064 12 168 1440 8 0 1062 1072 + 8 64 sde 1276 223 11980 6799 26 168 1440 13 0 6799 6812 + 8 65 sde1 584 211 6348 1415 12 168 1440 13 0 1415 1428 + 8 0 sda 759548 94747 39341300 4308146 18282936 366067539 3173346680 1907755230 0 19003350 1912122964 + 8 1 sda1 123 0 984 852 0 0 0 0 0 852 852 + 8 2 sda2 27828 434 1999858 440269 2148105 176690 120364864 58395605 0 2542454 58835289 + 8 3 sda3 
731010 94303 37335682 3860003 15717840 365890849 3052981816 1849318320 0 18504965 1853238847 diff --git a/be/test/util/test_data/net_dev_normal b/be/test/util/test_data/net_dev_normal new file mode 100644 index 0000000000..72ef4dd377 --- /dev/null +++ b/be/test/util/test_data/net_dev_normal @@ -0,0 +1,6 @@ +Inter-| Receive | Transmit + face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed + lo:67198081903 62228221 0 0 0 0 0 0 67198081903 62228221 0 0 0 0 0 0 + eth0: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + eth1: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + xgbe0:52567436039 65066152 0 0 0 0 0 263563 45480856156 88277614 0 0 0 0 0 0 diff --git a/be/test/util/test_data/stat_normal b/be/test/util/test_data/stat_normal new file mode 100644 index 0000000000..be5c4d0be7 --- /dev/null +++ b/be/test/util/test_data/stat_normal @@ -0,0 +1,40 @@ +cpu 57199151 2616310 10600935 1517505423 2137148 0 108277 0 0 +cpu0 1857959 843 417050 46622765 726528 0 67523 0 0 +cpu1 1448971 178220 821126 47188721 53514 0 2229 0 0 +cpu2 1293706 435282 573300 47355625 32321 0 2547 0 0 +cpu3 1119358 136064 311413 48102986 21111 0 1846 0 0 +cpu4 5725308 49756 287655 43611621 18271 0 163 0 0 +cpu5 4563178 28230 210153 44875816 15278 0 116 0 0 +cpu6 3315423 20057 162762 46179021 15418 0 87 0 0 +cpu7 1818913 15813 134122 47710122 13723 0 72 0 0 +cpu8 1818748 211011 680493 46302318 658125 0 22023 0 0 +cpu9 1931996 478511 1147684 46085783 46588 0 2188 0 0 +cpu10 2121771 148785 512584 46882843 26091 0 647 0 0 +cpu11 2583067 68171 357044 46663959 20215 0 288 0 0 +cpu12 2002303 29815 244647 47400525 15203 0 247 0 0 +cpu13 1704931 20692 187845 47766178 12938 0 152 0 0 +cpu14 1229859 21146 139682 48290298 11612 0 140 0 0 +cpu15 1080223 19012 121014 48458977 13339 0 112 0 0 +cpu16 1773329 8693 152456 47593449 164286 0 477 0 0 +cpu17 1154788 67934 214196 48244789 10786 0 229 0 0 +cpu18 1216739 155979 486022 47826726 6881 0 377 0 0 +cpu19 2899599 66056 255659 
46464162 7087 0 158 0 0 +cpu20 3748997 27444 111471 45800119 4631 0 55 0 0 +cpu21 2756406 15013 72464 46845272 3519 0 38 0 0 +cpu22 1392591 11344 57292 48227061 4393 0 31 0 0 +cpu23 866409 7897 47801 48766197 4369 0 34 0 0 +cpu24 1213557 82818 215125 48026562 153966 0 676 0 0 +cpu25 598053 161519 474195 48452419 5916 0 598 0 0 +cpu26 741578 87545 205168 48652532 5600 0 276 0 0 +cpu27 591082 25633 156052 48895031 24799 0 96 0 0 +cpu28 600102 12345 59937 49015936 4314 0 58 0 0 +cpu29 478329 8254 51041 49150167 4490 0 408 0 0 +cpu30 539254 10142 185346 48931824 25826 0 291 0 0 +cpu31 1012611 6269 1548122 47115604 5993 0 4081 0 0 +intr 20935913098 223 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 63 0 0 0 0 0 0 32 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1656 0 0 0 0 0 20283663 0 0 248482 0 248481 63119261 3789166 3084823 2750100 2669596 2679925 2699219 2713070 3674365 2862144 2687438 3804257 3746619 2691619 2750823 2793663 1154789 409339 342208 349319 410573 436073 466973 446130 434789 367860 396182 1263273 418746 1593956 6004810 3076068 486 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +ctxt 11043516832 +btime 1515573799 +processes 72044703 +procs_running 1 +procs_blocked 0 +softirq 38919180590 0 3019187263 99450 154925206 21173060 0 20278807 2302181339 98 
3336564295 diff --git a/be/test/util/types_test.cpp b/be/test/util/types_test.cpp new file mode 100644 index 0000000000..b8d7700e95 --- /dev/null +++ b/be/test/util/types_test.cpp @@ -0,0 +1,59 @@ +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "util/types.h" + +#include + +#include "runtime/large_int_value.h" + +namespace palo { + +class TypesTest : public ::testing::Test { +protected: + TypesTest() { + } + virtual ~TypesTest() { + } +}; + +TEST_F(TypesTest, packed_int128) { + // check align + ASSERT_EQ(1, alignof(PackedInt128)); + + // check assign + __int128 test_value = 123456789987654321; + test_value *= 1000000000000000000UL; + test_value += 123456789987654321UL; + char buf[30]; + *reinterpret_cast(buf + 1) = test_value; + ASSERT_EQ(reinterpret_cast(buf + 1)->value, test_value); + LOG(INFO) << reinterpret_cast(buf + 1)->value; + { + char buf2[64]; + *reinterpret_cast(buf2 + 7) = *reinterpret_cast(buf + 1); + reinterpret_cast(buf2 + 7)->value += 100; + ASSERT_EQ(reinterpret_cast(buf2 + 7)->value, test_value + 100); + LOG(INFO) << reinterpret_cast(buf2 + 7)->value; + } +} + +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/be/test/util/url_coding_test.cpp b/be/test/util/url_coding_test.cpp index e2c26f8592..b1b93b3ef5 100644 --- a/be/test/util/url_coding_test.cpp +++ b/be/test/util/url_coding_test.cpp @@ 
-17,7 +17,7 @@ #include #include #include -#include "util/url-coding.h" +#include "util/url_coding.h" #include "util/logging.h" namespace palo { diff --git a/docs/help/Contents/Account Management/help.md b/docs/help/Contents/Account Management/help.md index 5ed74f2524..2b5cd75486 100644 --- a/docs/help/Contents/Account Management/help.md +++ b/docs/help/Contents/Account Management/help.md @@ -114,17 +114,22 @@ Syntax: SET PROPERTY [FOR 'user'] 'key' = 'value' [, 'key' = 'value'] - 设置用户的属性,包括分é…给用户的资æºç­‰ã€‚ + 设置用户的属性,包括分é…给用户的资æºã€å¯¼å…¥cluster等。 key: 超级用户æƒé™: max_user_connections: 最大连接数。 resource.cpu_share: cpu资æºåˆ†é…。 + load_cluster.{cluster_name}.priority: 为指定的cluster分é…优先级,å¯ä»¥ä¸º HIGH 或 NORMAL 普通用户æƒé™ï¼š quota.normal: normal级别的资æºåˆ†é…。 quota.high: high级别的资æºåˆ†é…。 quota.low: low级别的资æºåˆ†é…。 + load_cluster.{cluster_name}.hadoop_palo_path: palo使用的hadoop目录,需è¦å­˜æ”¾etl程åºåŠetl生æˆçš„中间数æ®ä¾›palo导入。导入完æˆåŽä¼šè‡ªåŠ¨æ¸…ç†ä¸­é—´æ•°æ®ï¼Œetl程åºè‡ªåЍä¿ç•™ä¸‹æ¬¡ä½¿ç”¨ã€‚ + load_cluster.{cluster_name}.hadoop_configs: hadoopçš„é…置,其中fs.default.nameã€mapred.job.trackerã€hadoop.job.ugi必须填写。 + load_cluster.{cluster_name}.hadoop_http_port: hadoop hdfs name node http端å£ï¼Œé»˜è®¤ä¸º8070。 + default_load_cluster: 默认的导入cluster。 ## example 1. 修改用户 jack 最大连接数为1000 @@ -136,6 +141,20 @@ 3. 修改 jack 用户的normal组的æƒé‡ SET PROPERTY FOR 'jack' 'quota.normal' = '400'; + 4. 为用户 jack 添加导入cluster + SET PROPERTY FOR 'jack' + 'load_cluster.{cluster_name}.hadoop_palo_path' = '/user/palo/palo_path', + 'load_cluster.{cluster_name}.hadoop_configs' = 'fs.default.name=hdfs://dpp.cluster.com:port;mapred.job.tracker=dpp.cluster.com:port;hadoop.job.ugi=user,password;mapred.job.queue.name=job_queue_name_in_hadoop;mapred.job.priority=HIGH;'; + + 5. 删除用户 jack 下的导入cluster。 + SET PROPERTY FOR 'jack' 'load_cluster.{cluster_name}' = NULL; + + 6. 修改用户 jack 默认的导入cluster + SET PROPERTY FOR 'jack' 'default_load_cluster' = '{cluster_name}'; + + 7. 
修改用户 jack 的集群优先级为 HIGH + SET PROPERTY FOR 'jack' 'load_cluster.{cluster_name}.priority' = 'HIGH'; + ## keyword SET, PROPERTY diff --git a/docs/help/Contents/Data Manipulation/manipulation_stmt.md b/docs/help/Contents/Data Manipulation/manipulation_stmt.md index 48674f6504..0ca29466d5 100644 --- a/docs/help/Contents/Data Manipulation/manipulation_stmt.md +++ b/docs/help/Contents/Data Manipulation/manipulation_stmt.md @@ -338,8 +338,8 @@ max_filter_ratio: 用于指定å…许过滤ä¸è§„范数æ®çš„æœ€å¤§æ¯”例,默认是0,ä¸å…许过滤 自定义指定应该如下:'max_filter_ratio=0.2',å«ä¹‰æ˜¯å…许20%的错误率 - - timeout: 指定 load 作业的超时时间,å•使˜¯ç§’。当loadæ‰§è¡Œæ—¶é—´è¶…è¿‡è¯¥é˜ˆå€¼æ—¶ï¼Œä¼šè‡ªåŠ¨å–æ¶ˆã€‚默认超时时间是 86400 秒。 + + timeout: 指定 load 作业的超时时间,å•使˜¯ç§’。当loadæ‰§è¡Œæ—¶é—´è¶…è¿‡è¯¥é˜ˆå€¼æ—¶ï¼Œä¼šè‡ªåŠ¨å–æ¶ˆã€‚默认超时时间是 86400 秒。 建议指定 timeout æ—¶é—´å°äºŽ 86400 秒。 hll: 用于指定数æ®é‡Œé¢å’Œè¡¨é‡Œé¢çš„HLL列的对应关系,表中的列和数æ®é‡Œé¢æŒ‡å®šçš„列 @@ -362,7 +362,7 @@ curl --location-trusted -u root -T testData http://host:port/api/testDb/testTbl/_load?label=123 2. 将本地文件'testData'中的数æ®å¯¼å…¥åˆ°æ•°æ®åº“'testDb'中'testTbl'的表(用户是test_cluster中的)。超时时间是 3600 ç§’ - curl --location-trusted -u root@test_cluster:root -T testData http://fe.host:port/api/testDb/testTbl/_load?label=123i\&timeout=3600 + curl --location-trusted -u root@test_cluster:root -T testData http://fe.host:port/api/testDb/testTbl/_load?label=123&timeout=3600 3. 
将本地文件'testData'中的数æ®å¯¼å…¥åˆ°æ•°æ®åº“'testDb'中'testTbl'的表, å…许20%的错误率(用户是defalut_cluster中的) curl --location-trusted -u root -T testData http://host:port/api/testDb/testTbl/_load?label=123\&max_filter_ratio=0.2 diff --git a/docs/resources/fe_page_index.png b/docs/resources/fe_page_index.png deleted file mode 100644 index f88702d341..0000000000 Binary files a/docs/resources/fe_page_index.png and /dev/null differ diff --git a/docs/resources/fe_page_logs.png b/docs/resources/fe_page_logs.png deleted file mode 100644 index 565a3d8a7c..0000000000 Binary files a/docs/resources/fe_page_logs.png and /dev/null differ diff --git a/docs/resources/fe_page_queries.png b/docs/resources/fe_page_queries.png deleted file mode 100644 index 023389fca5..0000000000 Binary files a/docs/resources/fe_page_queries.png and /dev/null differ diff --git a/docs/resources/fe_page_sessions.png b/docs/resources/fe_page_sessions.png deleted file mode 100644 index 3b7b6b0b11..0000000000 Binary files a/docs/resources/fe_page_sessions.png and /dev/null differ diff --git a/docs/resources/fe_page_system.png b/docs/resources/fe_page_system.png deleted file mode 100644 index 829cd6b328..0000000000 Binary files a/docs/resources/fe_page_system.png and /dev/null differ diff --git a/docs/resources/fe_page_system_access.png b/docs/resources/fe_page_system_access.png deleted file mode 100644 index f7e63cde9f..0000000000 Binary files a/docs/resources/fe_page_system_access.png and /dev/null differ diff --git a/docs/resources/fe_page_system_backends.png b/docs/resources/fe_page_system_backends.png deleted file mode 100644 index 96aa9070f4..0000000000 Binary files a/docs/resources/fe_page_system_backends.png and /dev/null differ diff --git a/docs/resources/fe_page_system_brokers.png b/docs/resources/fe_page_system_brokers.png deleted file mode 100644 index 2c0691c486..0000000000 Binary files a/docs/resources/fe_page_system_brokers.png and /dev/null differ diff --git a/docs/resources/fe_page_system_dbs.png 
b/docs/resources/fe_page_system_dbs.png deleted file mode 100644 index 0a8ec0768a..0000000000 Binary files a/docs/resources/fe_page_system_dbs.png and /dev/null differ diff --git a/docs/resources/fe_page_system_error_hub.png b/docs/resources/fe_page_system_error_hub.png deleted file mode 100644 index b2208240b4..0000000000 Binary files a/docs/resources/fe_page_system_error_hub.png and /dev/null differ diff --git a/docs/resources/fe_page_system_frontends.png b/docs/resources/fe_page_system_frontends.png deleted file mode 100644 index c7fb35cf21..0000000000 Binary files a/docs/resources/fe_page_system_frontends.png and /dev/null differ diff --git a/docs/resources/fe_page_system_jobs.png b/docs/resources/fe_page_system_jobs.png deleted file mode 100644 index 16af785493..0000000000 Binary files a/docs/resources/fe_page_system_jobs.png and /dev/null differ diff --git a/docs/resources/fe_page_system_statistic.png b/docs/resources/fe_page_system_statistic.png deleted file mode 100644 index 5e5cb04052..0000000000 Binary files a/docs/resources/fe_page_system_statistic.png and /dev/null differ diff --git a/docs/resources/fe_page_system_tasks.png b/docs/resources/fe_page_system_tasks.png deleted file mode 100644 index b791fffcd9..0000000000 Binary files a/docs/resources/fe_page_system_tasks.png and /dev/null differ diff --git a/fe/src/com/baidu/palo/PaloFe.java b/fe/src/com/baidu/palo/PaloFe.java index 8882420584..d981932064 100644 --- a/fe/src/com/baidu/palo/PaloFe.java +++ b/fe/src/com/baidu/palo/PaloFe.java @@ -15,9 +15,14 @@ package com.baidu.palo; +import com.baidu.palo.catalog.Catalog; +import com.baidu.palo.common.CommandLineOptions; import com.baidu.palo.common.Config; import com.baidu.palo.common.Log4jConfig; +import com.baidu.palo.common.Version; import com.baidu.palo.http.HttpServer; +import com.baidu.palo.journal.bdbje.BDBTool; +import com.baidu.palo.journal.bdbje.BDBToolOptions; import com.baidu.palo.qe.QeService; import com.baidu.palo.service.ExecuteEnv; import 
com.baidu.palo.service.FeServer; @@ -26,6 +31,11 @@ import com.baidu.palo.service.FrontendOptions; import com.google.common.base.Charsets; import com.google.common.base.Strings; +import org.apache.commons.cli.BasicParser; +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -44,6 +54,9 @@ public class PaloFe { // entrance for palo frontend public static void main(String[] args) { + CommandLineOptions cmdLineOpts = parseArgs(args); + System.out.println(cmdLineOpts.toString()); + try { final String paloHome = System.getenv("PALO_HOME"); if (Strings.isNullOrEmpty(paloHome)) { @@ -56,14 +69,17 @@ public class PaloFe { throw new IOException("pid file is already locked."); } - // set dns cache ttl - java.security.Security.setProperty("networkaddress.cache.ttl" , "60"); - // init config new Config().init(paloHome + "/conf/fe.conf"); Log4jConfig.initLogging(); - LOG.info("Palo FE start"); + // set dns cache ttl + java.security.Security.setProperty("networkaddress.cache.ttl" , "60"); + + // check command line options + checkCommandLineOptions(cmdLineOpts); + + LOG.info("Palo FE starting..."); FrontendOptions.init(); ExecuteEnv.setup(); @@ -96,6 +112,136 @@ public class PaloFe { } } // end PaloFe main() + /* + * -v --version + * Print the version of Palo Frontend + * -h --helper + * Specify the helper node when joining a bdb je replication group + * -b --bdb + * Run bdbje debug tools + * + * -l --listdb + * List all database names in bdbje + * -d --db + * Specify a database in bdbje + * + * -s --stat + * Print statistic of a database, including count, first key, last key + * -f --from + * Specify the start scan key + * -t --to + * Specify the end scan key + * -m --metaversion + * Specify the meta version to decode log value + * + */ + private static 
CommandLineOptions parseArgs(String[] args) { + CommandLineParser commandLineParser = new BasicParser(); + Options options = new Options(); + options.addOption("v", "version", false, "Print the version of Palo Frontend"); + options.addOption("h", "helper", true, "Specify the helper node when joining a bdb je replication group"); + options.addOption("b", "bdb", false, "Run bdbje debug tools"); + options.addOption("l", "listdb", false, "Run bdbje debug tools"); + options.addOption("d", "db", true, "Specify a database in bdbje"); + options.addOption("s", "stat", false, "Print statistic of a database, including count, first key, last key"); + options.addOption("f", "from", true, "Specify the start scan key"); + options.addOption("t", "to", true, "Specify the end scan key"); + options.addOption("m", "metaversion", true, "Specify the meta version to decode log value"); + + CommandLine cmd = null; + try { + cmd = commandLineParser.parse(options, args); + } catch (final ParseException e) { + e.printStackTrace(); + System.err.println("Failed to parse command line. 
exit now"); + System.exit(-1); + } + + // version + if (cmd.hasOption('v') || cmd.hasOption("version")) { + return new CommandLineOptions(true, "", null); + } else if (cmd.hasOption('b') || cmd.hasOption("bdb")) { + if (cmd.hasOption('l') || cmd.hasOption("listdb")) { + // list bdb je databases + BDBToolOptions bdbOpts = new BDBToolOptions(true, "", false, "", "", 0); + return new CommandLineOptions(false, "", bdbOpts); + } else if (cmd.hasOption('d') || cmd.hasOption("db")) { + // specify a database + String dbName = cmd.getOptionValue("db"); + if (Strings.isNullOrEmpty(dbName)) { + System.err.println("BDBJE database name is missing"); + System.exit(-1); + } + + if (cmd.hasOption('s') || cmd.hasOption("stat")) { + BDBToolOptions bdbOpts = new BDBToolOptions(false, dbName, true, "", "", 0); + return new CommandLineOptions(false, "", bdbOpts); + } else { + String fromKey = ""; + String endKey = ""; + int metaVersion = 0; + if (cmd.hasOption('f') || cmd.hasOption("from")) { + fromKey = cmd.getOptionValue("from"); + if (Strings.isNullOrEmpty(fromKey)) { + System.err.println("from key is missing"); + System.exit(-1); + } + } + if (cmd.hasOption('t') || cmd.hasOption("to")) { + endKey = cmd.getOptionValue("to"); + if (Strings.isNullOrEmpty(endKey)) { + System.err.println("end key is missing"); + System.exit(-1); + } + } + if (cmd.hasOption('m') || cmd.hasOption("metaversion")) { + try { + metaVersion = Integer.valueOf(cmd.getOptionValue("metaversion")); + } catch (NumberFormatException e) { + System.err.println("Invalid meta version format"); + System.exit(-1); + } + } + + BDBToolOptions bdbOpts = new BDBToolOptions(false, dbName, false, fromKey, endKey, metaVersion); + return new CommandLineOptions(false, "", bdbOpts); + } + } else { + System.err.println("Invalid options when running bdb je tools"); + System.exit(-1); + } + } else if (cmd.hasOption('h') || cmd.hasOption("helper")) { + String helperNode = cmd.getOptionValue("helper"); + if 
(Strings.isNullOrEmpty(helperNode)) { + System.err.println("Missing helper node"); + System.exit(-1); + } + return new CommandLineOptions(false, helperNode, null); + } + + // helper node is null, means no helper node is specified + return new CommandLineOptions(false, null, null); + } + + private static void checkCommandLineOptions(CommandLineOptions cmdLineOpts) { + if (cmdLineOpts.isVersion()) { + System.out.println("Build version: " + Version.PALO_BUILD_VERSION); + System.out.println("Build time: " + Version.PALO_BUILD_TIME); + System.out.println("Build info: " + Version.PALO_BUILD_INFO); + System.out.println("Build hash: " + Version.PALO_BUILD_HASH); + System.exit(0); + } else if (cmdLineOpts.runBdbTools()) { + BDBTool bdbTool = new BDBTool(Catalog.BDB_DIR, cmdLineOpts.getBdbToolOpts()); + if (bdbTool.run()) { + System.exit(0); + } else { + System.exit(-1); + } + } + + // go on + } + private static boolean createAndLockPidFile(String pidFilePath) throws IOException { File pid = new File(pidFilePath); RandomAccessFile file = new RandomAccessFile(pid, "rws"); diff --git a/fe/src/com/baidu/palo/alter/Alter.java b/fe/src/com/baidu/palo/alter/Alter.java index 69b85f8220..b4eaced493 100644 --- a/fe/src/com/baidu/palo/alter/Alter.java +++ b/fe/src/com/baidu/palo/alter/Alter.java @@ -94,6 +94,8 @@ public class Alter { boolean hasPartition = false; // rename ops, if has, should appear one and only one entry boolean hasRename = false; + // modify properties ops, if has, should appear one and only one entry + boolean hasModifyProp = false; // check conflict alter ops first List alterClauses = stmt.getOps(); @@ -122,24 +124,28 @@ public class Alter { && !hasRollup && !hasPartition && !hasRename) { hasSchemaChange = true; } else if (alterClause instanceof AddRollupClause && !hasSchemaChange && !hasRollup && !hasPartition - && !hasRename) { + && !hasRename && !hasModifyProp) { hasRollup = true; } else if (alterClause instanceof DropRollupClause && !hasSchemaChange && 
!hasRollup && !hasPartition - && !hasRename) { + && !hasRename && !hasModifyProp) { hasRollup = true; } else if (alterClause instanceof AddPartitionClause && !hasSchemaChange && !hasRollup && !hasPartition - && !hasRename) { + && !hasRename && !hasModifyProp) { hasPartition = true; } else if (alterClause instanceof DropPartitionClause && !hasSchemaChange && !hasRollup && !hasPartition - && !hasRename) { + && !hasRename && !hasModifyProp) { hasPartition = true; } else if (alterClause instanceof ModifyPartitionClause && !hasSchemaChange && !hasRollup - && !hasPartition && !hasRename) { + && !hasPartition && !hasRename && !hasModifyProp) { hasPartition = true; } else if ((alterClause instanceof TableRenameClause || alterClause instanceof RollupRenameClause || alterClause instanceof PartitionRenameClause || alterClause instanceof ColumnRenameClause) - && !hasSchemaChange && !hasRollup && !hasPartition && !hasRename) { + && !hasSchemaChange && !hasRollup && !hasPartition && !hasRename && !hasModifyProp) { hasRename = true; + } else if (alterClause instanceof ModifyTablePropertiesClause && !hasSchemaChange && !hasRollup + && !hasPartition + && !hasRename && !hasModifyProp) { + hasModifyProp = true; } else { throw new DdlException("Conflicting alter clauses. 
see help for more information"); } @@ -183,7 +189,7 @@ public class Alter { } } - if (hasSchemaChange) { + if (hasSchemaChange || hasModifyProp) { schemaChangeHandler.process(alterClauses, clusterName, db, olapTable); } else if (hasRollup) { rollupHandler.process(alterClauses, clusterName, db, olapTable); diff --git a/fe/src/com/baidu/palo/alter/RollupHandler.java b/fe/src/com/baidu/palo/alter/RollupHandler.java index d363ddc339..0da1f16ac2 100644 --- a/fe/src/com/baidu/palo/alter/RollupHandler.java +++ b/fe/src/com/baidu/palo/alter/RollupHandler.java @@ -280,10 +280,6 @@ public class RollupHandler extends AlterHandler { throw new DdlException(e.getMessage()); } - if (rollupStorageType == TStorageType.ROW) { - throw new DdlException("Can not add rollup with ROW storage type"); - } - // check storage type if has null column boolean hasNullColumn = false; for (Column column : rollupSchema) { diff --git a/fe/src/com/baidu/palo/alter/SchemaChangeHandler.java b/fe/src/com/baidu/palo/alter/SchemaChangeHandler.java index 77f936358f..6229fce888 100644 --- a/fe/src/com/baidu/palo/alter/SchemaChangeHandler.java +++ b/fe/src/com/baidu/palo/alter/SchemaChangeHandler.java @@ -789,6 +789,10 @@ public class SchemaChangeHandler extends AlterHandler { bfFpp = 0; } + // property 3 storage type + // from now on, we only support COLUMN storage type + TStorageType newStorageType = TStorageType.COLUMN; + // resource info TResourceInfo resourceInfo = null; if (ConnectContext.get() != null) { @@ -856,6 +860,14 @@ public class SchemaChangeHandler extends AlterHandler { } } + if (!needAlter) { + // check if storage type changed + TStorageType currentStorageType = olapTable.getStorageTypeByIndexId(alterIndexId); + if (currentStorageType != newStorageType) { + needAlter = true; + } + } + if (!needAlter) { LOG.debug("index[{}] is not changed. 
ignore", alterIndexId); continue; @@ -1005,6 +1017,9 @@ public class SchemaChangeHandler extends AlterHandler { throw new DdlException("Nothing is changed. please check your alter stmt."); } + // from now on, storage type can only be column + schemaChangeJob.setNewStorageType(TStorageType.COLUMN); + // the following operations are done outside the 'for indices' loop // to avoid partial check success @@ -1283,7 +1298,7 @@ public class SchemaChangeHandler extends AlterHandler { processReorderColumn((ReorderColumnsClause) alterClause, olapTable, indexSchemaMap); } else if (alterClause instanceof ModifyTablePropertiesClause) { // modify table properties - ; + // do nothing, properties are already in propertyMap } else { Preconditions.checkState(false); } diff --git a/fe/src/com/baidu/palo/alter/SchemaChangeJob.java b/fe/src/com/baidu/palo/alter/SchemaChangeJob.java index 8bb9db2c07..5547dcf8f3 100644 --- a/fe/src/com/baidu/palo/alter/SchemaChangeJob.java +++ b/fe/src/com/baidu/palo/alter/SchemaChangeJob.java @@ -105,6 +105,11 @@ public class SchemaChangeJob extends AlterJob { private Set bfColumns; private double bfFpp; + // Init as null, to be compatible with former schema change job. + // If this is set to null, storage type will remain what it was. 
+ // This can only set to COLUMN + private TStorageType newStorageType = null; + private SchemaChangeJob() { this(-1, -1, null, null); } @@ -151,7 +156,7 @@ public class SchemaChangeJob extends AlterJob { // schema info public void setNewSchemaInfo(long indexId, int newSchemaVersion, int newSchemaHash, - short newShortKeyColumnCount) { + short newShortKeyColumnCount) { this.changedIndexIdToSchemaVersion.put(indexId, newSchemaVersion); this.changedIndexIdToSchemaHash.put(indexId, newSchemaHash); this.changedIndexIdToShortKeyColumnCount.put(indexId, newShortKeyColumnCount); @@ -188,6 +193,11 @@ public class SchemaChangeJob extends AlterJob { this.bfFpp = bfFpp; } + public void setNewStorageType(TStorageType newStorageType) { + Preconditions.checkState(newStorageType == TStorageType.COLUMN); + this.newStorageType = newStorageType; + } + public boolean isSchemaHashRelated(int schemaHash) { return changedIndexIdToSchemaHash.values().contains(schemaHash); } @@ -328,7 +338,6 @@ public class SchemaChangeJob extends AlterJob { int baseSchemaHash = olapTable.getSchemaHashByIndexId(indexId); short newShortKeyColumnCount = this.changedIndexIdToShortKeyColumnCount.get(indexId); Preconditions.checkState(newShortKeyColumnCount != (short) -1); - TStorageType storageType = olapTable.getStorageTypeByIndexId(indexId); KeysType keysType = olapTable.getKeysType(); TKeysType schemaChangeKeysType; if (keysType == KeysType.DUP_KEYS) { @@ -339,6 +348,8 @@ public class SchemaChangeJob extends AlterJob { schemaChangeKeysType = TKeysType.AGG_KEYS; } + TStorageType storageType = newStorageType == null ? 
olapTable.getStorageTypeByIndexId(indexId) + : newStorageType; for (Tablet tablet : alterIndex.getTablets()) { long tabletId = tablet.getId(); short replicaSendNum = 0; @@ -357,7 +368,8 @@ public class SchemaChangeJob extends AlterJob { partitionId, indexId, tabletId, replicaId, alterSchema, newSchemaHash, baseSchemaHash, newShortKeyColumnCount, - storageType, bfColumns, bfFpp, schemaChangeKeysType); + storageType, + bfColumns, bfFpp, schemaChangeKeysType); addReplicaId(indexId, replicaId, backendId); tasks.add(schemaChangeTask); replicaSendNum++; @@ -703,6 +715,10 @@ public class SchemaChangeJob extends AlterJob { short shortKeyColumnCount = changedIndexIdToShortKeyColumnCount.get(indexId); olapTable.setIndexSchemaInfo(indexId, null, entry.getValue(), schemaVersion, schemaHash, shortKeyColumnCount); + + if (newStorageType != null) { + olapTable.setIndexStorageType(indexId, newStorageType); + } } // 3. update base schema if changed @@ -820,6 +836,10 @@ public class SchemaChangeJob extends AlterJob { olapTable.setIndexSchemaInfo(indexId, null, entry.getValue(), schemaVersion, schemaHash, shortKeyColumnCount); + if (newStorageType != null) { + olapTable.setIndexStorageType(indexId, newStorageType); + } + if (indexId == olapTable.getId()) { olapTable.setNewBaseSchema(entry.getValue()); } @@ -932,6 +952,13 @@ public class SchemaChangeJob extends AlterJob { } else { out.writeBoolean(false); } + + // storage type + if (newStorageType == null) { + out.writeBoolean(false); + } else { + out.writeBoolean(true); + } } @Override @@ -990,6 +1017,12 @@ public class SchemaChangeJob extends AlterJob { bfFpp = in.readDouble(); } } + + if (Catalog.getCurrentCatalogJournalVersion() >= FeMetaVersion.VERSION_39) { + if (in.readBoolean()) { + newStorageType = TStorageType.COLUMN; + } + } } public static SchemaChangeJob read(DataInput in) throws IOException { diff --git a/fe/src/com/baidu/palo/analysis/AggregateInfo.java b/fe/src/com/baidu/palo/analysis/AggregateInfo.java index 
8db80caec5..a4871934ad 100644 --- a/fe/src/com/baidu/palo/analysis/AggregateInfo.java +++ b/fe/src/com/baidu/palo/analysis/AggregateInfo.java @@ -32,6 +32,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.util.ArrayList; +import java.util.Iterator; import java.util.List; /** @@ -155,7 +156,7 @@ public final class AggregateInfo extends AggregateInfoBase { public List getPartitionExprs() { return partitionExprs_; } public void setPartitionExprs(List exprs) { partitionExprs_ = exprs; } - + /** * Creates complete AggregateInfo for groupingExprs and aggExprs, including * aggTupleDesc and aggTupleSMap. If parameter tupleDesc != null, sets aggTupleDesc to @@ -184,7 +185,17 @@ public final class AggregateInfo extends AggregateInfoBase { } } - if (distinctAggExprs.isEmpty()) { + // aggregation algorithm includes two kinds:one stage aggregation, tow stage aggregation. + // for case: + // 1: if aggExprs don't hava distinct or hava multi distinct , create aggregate info for + // one stage aggregation. 
+ // 2: if aggExprs hava one distinct , create aggregate info for two stage aggregation + boolean isMultiDistinct = result.estimateIfContainsMultiDistinct(distinctAggExprs); + if (distinctAggExprs.isEmpty() + || isMultiDistinct) { + // It is used to map new aggr expr to old expr to help create an external + // reference to the aggregation node tuple + result.setIsMultiDistinct(isMultiDistinct); if (tupleDesc == null) { result.createTupleDescs(analyzer); result.createSmaps(analyzer); @@ -196,6 +207,7 @@ public final class AggregateInfo extends AggregateInfoBase { } result.createMergeAggInfo(analyzer); } else { + // case 2: // we don't allow you to pass in a descriptor for distinct aggregation // (we need two descriptors) Preconditions.checkState(tupleDesc == null); @@ -205,6 +217,53 @@ public final class AggregateInfo extends AggregateInfoBase { return result; } + + /** + * estimate if functions contains multi distinct + * @param distinctAggExprs + * @return + */ + public static boolean estimateIfContainsMultiDistinct(List distinctAggExprs) + throws AnalysisException { + + if (distinctAggExprs == null || distinctAggExprs.size() <= 0) { + return false; + } + + ArrayList expr0Children = Lists.newArrayList(); + if (distinctAggExprs.get(0).getFnName().getFunction().equalsIgnoreCase("group_concat")) { + // Ignore separator parameter, otherwise the same would have to be present for all + // other distinct aggregates as well. + // TODO: Deal with constant exprs more generally, instead of special-casing + // group_concat(). 
+ expr0Children.add(distinctAggExprs.get(0).getChild(0).ignoreImplicitCast()); + } else { + for (Expr expr : distinctAggExprs.get(0).getChildren()) { + expr0Children.add(expr.ignoreImplicitCast()); + } + } + boolean hasMultiDistinct = false; + for (int i = 1; i < distinctAggExprs.size(); ++i) { + ArrayList exprIChildren = Lists.newArrayList(); + if (distinctAggExprs.get(i).getFnName().getFunction().equalsIgnoreCase("group_concat")) { + exprIChildren.add(distinctAggExprs.get(i).getChild(0).ignoreImplicitCast()); + } else { + for (Expr expr : distinctAggExprs.get(i).getChildren()) { + exprIChildren.add(expr.ignoreImplicitCast()); + } + } + if (!Expr.equalLists(expr0Children, exprIChildren)) { + if (exprIChildren.size() > 1 || expr0Children.size() > 1) { + throw new AnalysisException("The query contains multi count distinct or " + + "sum distinct, each can't have multi columns."); + } + hasMultiDistinct = true; + break; + } + } + return hasMultiDistinct; + } + /** * Create aggregate info for select block containing aggregate exprs with * DISTINCT clause. 
@@ -252,39 +311,13 @@ public final class AggregateInfo extends AggregateInfoBase { } } - for (int i = 1; i < distinctAggExprs.size(); ++i) { - ArrayList exprIChildren = Lists.newArrayList(); - if (distinctAggExprs.get(i).getFnName().getFunction().equalsIgnoreCase("group_concat")) { - exprIChildren.add(distinctAggExprs.get(i).getChild(0).ignoreImplicitCast()); - } else { - for (Expr expr : distinctAggExprs.get(i).getChildren()) { - exprIChildren.add(expr.ignoreImplicitCast()); - } - } - if (!Expr.equalLists(expr0Children, exprIChildren)) { - if (exprIChildren.size() > 1 || expr0Children.size() > 1) { - throw new AnalysisException("The query contains multi count distinct or " - + "sum distinct, each can't have multi columns."); - } - this.isMultiDistinct_ = true; - break; - } - } + this.isMultiDistinct_= estimateIfContainsMultiDistinct(distinctAggExprs); isDistinctAgg = true; // add DISTINCT parameters to grouping exprs if (!isMultiDistinct_) { groupingExprs_.addAll(expr0Children); - } else { - // TODO(zc) - int groupExprSize = groupingExprs_.size(); - for (int i = 0; i < distinctAggExprs.size(); ++i) { - groupingExprs_.addAll(distinctAggExprs.get(i).getChildren()); - firstIdx_.add(groupExprSize); - lastIdx_.add(groupExprSize + distinctAggExprs.get(i).getChildren().size()); - groupExprSize += distinctAggExprs.get(i).getChildren().size(); - } - } + } // remove DISTINCT aggregate functions from aggExprs aggregateExprs_.removeAll(distinctAggExprs); @@ -321,6 +354,10 @@ public final class AggregateInfo extends AggregateInfoBase { !secondPhaseDistinctAggInfo_.getAggregateExprs().isEmpty()); } + public void setIsMultiDistinct(boolean value) { + this.isMultiDistinct_ = value; + } + public boolean isMultiDistinct() { return isMultiDistinct_; } @@ -387,9 +424,6 @@ public final class AggregateInfo extends AggregateInfoBase { aggregateExprs_.add((FunctionCallExpr) substitutedAgg); } - if (LOG.isTraceEnabled()) { - LOG.trace("AggInfo: agg_exprs=" + 
Expr.debugString(aggregateExprs_)); - } outputTupleSmap_.substituteLhs(smap, analyzer); intermediateTupleSmap_.substituteLhs(smap, analyzer); if (secondPhaseDistinctAggInfo_ != null) { @@ -532,23 +566,8 @@ public final class AggregateInfo extends AggregateInfoBase { aggExpr = new FunctionCallExpr(inputExpr.getFnName(), Lists.newArrayList(aggExprParam)); } } else { - List params = new ArrayList(); - for (int i = firstIdx_.get(distinctExprPos); i < lastIdx_.get( - distinctExprPos); i++) { - params.add(new SlotRef(inputDesc.getSlots().get(i))); - } - distinctExprPos += 1; - if (inputExpr.getFnName().getFunction().equalsIgnoreCase("COUNT")) { - aggExpr = new FunctionCallExpr("COUNT_DISTINCT", - new FunctionParams(params)); - - } else if (inputExpr.getFnName().getFunction().equalsIgnoreCase("SUM")) { - aggExpr = new FunctionCallExpr("SUM_DISTINCT", - new FunctionParams(params)); - } else { - throw new AnalysisException(inputExpr.getFnName() + " can't support multi distinct."); - } - + // multi distinct can't run here + Preconditions.checkState(false); } secondPhaseAggExprs.add(aggExpr); } @@ -646,10 +665,12 @@ public final class AggregateInfo extends AggregateInfoBase { exprs.addAll(groupingExprs_); exprs.addAll(aggregateExprs_); for (int i = 0; i < exprs.size(); ++i) { - outputTupleSmap_.put(exprs.get(i).clone(), + Expr expr = exprs.get(i); + outputTupleSmap_.put(expr.clone(), new SlotRef(outputTupleDesc_.getSlots().get(i))); if (!requiresIntermediateTuple()) continue; - intermediateTupleSmap_.put(exprs.get(i).clone(), + + intermediateTupleSmap_.put(expr.clone(), new SlotRef(intermediateTupleDesc_.getSlots().get(i))); outputToIntermediateTupleSmap_.put( new SlotRef(outputTupleDesc_.getSlots().get(i)), @@ -702,8 +723,7 @@ public final class AggregateInfo extends AggregateInfoBase { outputTupleDesc_.getSlots().get(groupExprsSize + i); SlotDescriptor intermediateSlotDesc = intermediateTupleDesc_.getSlots().get(groupExprsSize + i); - - if (isDistinctAgg) { + if 
(isDistinctAgg || isMultiDistinct_) { slotDesc.setIsMaterialized(true); intermediateSlotDesc.setIsMaterialized(true); } diff --git a/fe/src/com/baidu/palo/analysis/FunctionCallExpr.java b/fe/src/com/baidu/palo/analysis/FunctionCallExpr.java index 43a491f802..16ed9bc3e9 100644 --- a/fe/src/com/baidu/palo/analysis/FunctionCallExpr.java +++ b/fe/src/com/baidu/palo/analysis/FunctionCallExpr.java @@ -447,9 +447,9 @@ public class FunctionCallExpr extends Expr { fn = getBuiltinFunction(analyzer, fnName.getFunction(), new Type[]{compatibleType}, Function.CompareMode.IS_NONSTRICT_SUPERTYPE_OF); - } else { + } else { fn = getBuiltinFunction(analyzer, fnName.getFunction(), collectChildReturnTypes(), - Function.CompareMode.IS_NONSTRICT_SUPERTYPE_OF); + Function.CompareMode.IS_NONSTRICT_SUPERTYPE_OF); } if (fn == null) { @@ -463,14 +463,21 @@ public class FunctionCallExpr extends Expr { } if (isAggregateFunction()) { + final String functionName = fnName.getFunction(); // subexprs must not contain aggregates if (Expr.containsAggregate(children)) { throw new AnalysisException( "aggregate function cannot contain aggregate parameters: " + this.toSql()); } - if (STDDEV_FUNCTION_SET.contains(fnName.getFunction()) && argTypes[0].isDateType()) { + + if (STDDEV_FUNCTION_SET.contains(functionName) && argTypes[0].isDateType()) { throw new AnalysisException("Stddev/variance function do not support Date/Datetime type"); } + + if (functionName.equalsIgnoreCase("multi_distinct_sum") && argTypes[0].isDateType()) { + throw new AnalysisException("Sum in multi distinct functions do not support Date/Datetime type"); + } + } else { if (fnParams.isStar()) { throw new AnalysisException("Cannot pass '*' to scalar function."); diff --git a/fe/src/com/baidu/palo/analysis/JoinOperator.java b/fe/src/com/baidu/palo/analysis/JoinOperator.java index 84408a4f65..6bbffb3ccc 100644 --- a/fe/src/com/baidu/palo/analysis/JoinOperator.java +++ b/fe/src/com/baidu/palo/analysis/JoinOperator.java @@ -87,6 +87,18 @@ 
public enum JoinOperator { public boolean isCrossJoin() { return this == CROSS_JOIN; } + + public boolean isFullOuterJoin() { + return this == FULL_OUTER_JOIN; + } + + public boolean isLeftOuterJoin() { + return this == LEFT_OUTER_JOIN; + } + + public boolean isRightOuterJoin() { + return this == RIGHT_OUTER_JOIN; + } } diff --git a/fe/src/com/baidu/palo/analysis/ModifyTablePropertiesClause.java b/fe/src/com/baidu/palo/analysis/ModifyTablePropertiesClause.java index c254115607..7159a91b37 100644 --- a/fe/src/com/baidu/palo/analysis/ModifyTablePropertiesClause.java +++ b/fe/src/com/baidu/palo/analysis/ModifyTablePropertiesClause.java @@ -21,16 +21,17 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; +import com.baidu.palo.common.ErrorCode; +import com.baidu.palo.common.ErrorReport; import com.baidu.palo.common.util.PrintableMap; -import com.google.common.base.Joiner; -import com.google.common.base.Joiner.MapJoiner; -import com.google.common.collect.Maps; - import java.util.Map; // clause which is used to modify table properties public class ModifyTablePropertiesClause extends AlterClause { + + private static final String KEY_STORAGE_TYPE = "storage_type"; + private Map properties; public ModifyTablePropertiesClause(Map properties) { @@ -39,9 +40,20 @@ public class ModifyTablePropertiesClause extends AlterClause { @Override public void analyze(Analyzer analyzer) throws AnalysisException { + if (!analyzer.getCatalog().getUserMgr().isAdmin(analyzer.getUser())) { + ErrorReport.reportAnalysisException(ErrorCode.ERR_SPECIFIC_ACCESS_DENIED_ERROR, + "Modify table property"); + } + if (properties == null || properties.isEmpty()) { throw new AnalysisException("Properties is not set"); } + + if (properties.containsKey(KEY_STORAGE_TYPE)) { + if (!properties.get(KEY_STORAGE_TYPE).equals("column")) { + throw new AnalysisException("Can only change storage type to COLUMN"); + } + } } @Override diff --git 
a/fe/src/com/baidu/palo/analysis/RandomDistributionDesc.java b/fe/src/com/baidu/palo/analysis/RandomDistributionDesc.java index fef82e5454..fbbe59dc39 100644 --- a/fe/src/com/baidu/palo/analysis/RandomDistributionDesc.java +++ b/fe/src/com/baidu/palo/analysis/RandomDistributionDesc.java @@ -20,11 +20,11 @@ package com.baidu.palo.analysis; -import com.baidu.palo.common.AnalysisException; import com.baidu.palo.catalog.Column; import com.baidu.palo.catalog.DistributionInfo; -import com.baidu.palo.catalog.RandomDistributionInfo; import com.baidu.palo.catalog.DistributionInfo.DistributionInfoType; +import com.baidu.palo.catalog.RandomDistributionInfo; +import com.baidu.palo.common.AnalysisException; import java.io.DataInput; import java.io.DataOutput; @@ -32,6 +32,7 @@ import java.io.IOException; import java.util.List; import java.util.Set; +@Deprecated public class RandomDistributionDesc extends DistributionDesc { int numBucket; @@ -46,7 +47,7 @@ public class RandomDistributionDesc extends DistributionDesc { @Override public void analyze(Set colSet) throws AnalysisException { - throw new AnalysisException("Random distribution is deprecated now. Use hash distribution instead."); + throw new AnalysisException("Random distribution is deprecated now, use Hash distribution instead"); } @Override @@ -66,7 +67,6 @@ public class RandomDistributionDesc extends DistributionDesc { @Override public void write(DataOutput out) throws IOException { super.write(out); - out.writeInt(numBucket); } diff --git a/fe/src/com/baidu/palo/analysis/SelectStmt.java b/fe/src/com/baidu/palo/analysis/SelectStmt.java index 22a07063b7..ec1c30c4e0 100644 --- a/fe/src/com/baidu/palo/analysis/SelectStmt.java +++ b/fe/src/com/baidu/palo/analysis/SelectStmt.java @@ -846,6 +846,9 @@ public class SelectStmt extends QueryStmt { // ii) Other DISTINCT aggregates are present. 
ExprSubstitutionMap countAllMap = createCountAllMap(aggExprs, analyzer); countAllMap = ExprSubstitutionMap.compose(ndvSmap, countAllMap, analyzer); + final ExprSubstitutionMap multiCountOrSumDistinctMap = + createSumOrCountMultiDistinctSMap(aggExprs, analyzer); + countAllMap = ExprSubstitutionMap.compose(multiCountOrSumDistinctMap, countAllMap, analyzer); List substitutedAggs = Expr.substituteList(aggExprs, countAllMap, analyzer, false); aggExprs.clear(); @@ -957,6 +960,54 @@ public class SelectStmt extends QueryStmt { return result; } + + /** + * Build smap count_distinct->multi_count_distinct sum_distinct->multi_count_distinct + * assumes that select list and having clause have been analyzed. + */ + private ExprSubstitutionMap createSumOrCountMultiDistinctSMap( + ArrayList aggExprs, Analyzer analyzer) throws AnalysisException { + final List distinctExprs = Lists.newArrayList(); + for (FunctionCallExpr aggExpr : aggExprs) { + if (aggExpr.isDistinct()) { + distinctExprs.add(aggExpr); + } + } + final ExprSubstitutionMap result = new ExprSubstitutionMap(); + final boolean hasMultiDistinct = AggregateInfo.estimateIfContainsMultiDistinct(distinctExprs); + if (!hasMultiDistinct) { + return result; + } + for (FunctionCallExpr inputExpr : distinctExprs) { + Expr replaceExpr = null; + final String functionName = inputExpr.getFnName().getFunction(); + if (functionName.equalsIgnoreCase("COUNT")) { + final List countInputExpr = Lists.newArrayList(inputExpr.getChild(0).clone(null)); + replaceExpr = new FunctionCallExpr("MULTI_DISTINCT_COUNT", + new FunctionParams(inputExpr.isDistinct(), countInputExpr)); + } else if (functionName.equalsIgnoreCase("SUM")) { + final List sumInputExprs = Lists.newArrayList(inputExpr.getChild(0).clone(null)); + replaceExpr = new FunctionCallExpr("MULTI_DISTINCT_SUM", + new FunctionParams(inputExpr.isDistinct(), sumInputExprs)); + } else if (functionName.equalsIgnoreCase("AVG")) { + final List sumInputExprs = 
Lists.newArrayList(inputExpr.getChild(0).clone(null)); + final List countInputExpr = Lists.newArrayList(inputExpr.getChild(0).clone(null)); + final FunctionCallExpr sumExpr = new FunctionCallExpr("MULTI_DISTINCT_SUM", + new FunctionParams(inputExpr.isDistinct(), sumInputExprs)); + final FunctionCallExpr countExpr = new FunctionCallExpr("MULTI_DISTINCT_COUNT", + new FunctionParams(inputExpr.isDistinct(), countInputExpr)); + replaceExpr = new ArithmeticExpr(ArithmeticExpr.Operator.DIVIDE, sumExpr, countExpr); + } else { + throw new AnalysisException(inputExpr.getFnName() + " can't support multi distinct."); + } + + replaceExpr.analyze(analyzer); + result.put(inputExpr, replaceExpr); + } + if (LOG.isDebugEnabled()) LOG.debug("multi distinct smap: {}", result.debugString()); + return result; + } + /** * Create a map from COUNT([ALL]) -> zeroifnull(COUNT([ALL])) if * i) There is no GROUP-BY, and diff --git a/fe/src/com/baidu/palo/analysis/ShowTabletStmt.java b/fe/src/com/baidu/palo/analysis/ShowTabletStmt.java index 0d39d7f60c..e2c0693969 100644 --- a/fe/src/com/baidu/palo/analysis/ShowTabletStmt.java +++ b/fe/src/com/baidu/palo/analysis/ShowTabletStmt.java @@ -107,6 +107,7 @@ public class ShowTabletStmt extends ShowStmt { builder.addColumn(new Column("PartitionId", ColumnType.createVarchar(30))); builder.addColumn(new Column("IndexId", ColumnType.createVarchar(30))); builder.addColumn(new Column("IsSync", ColumnType.createVarchar(30))); + builder.addColumn(new Column("DetailCmd", ColumnType.createVarchar(30))); } else { for (String title : TabletsProcDir.TITLE_NAMES) { builder.addColumn(new Column(title, ColumnType.createVarchar(30))); diff --git a/fe/src/com/baidu/palo/analysis/TableRef.java b/fe/src/com/baidu/palo/analysis/TableRef.java index 7b24966f6a..95f0e8c50a 100644 --- a/fe/src/com/baidu/palo/analysis/TableRef.java +++ b/fe/src/com/baidu/palo/analysis/TableRef.java @@ -20,26 +20,25 @@ package com.baidu.palo.analysis; -import java.util.ArrayList; -import 
java.util.Collections; -import java.util.List; -import java.util.Set; - import com.baidu.palo.catalog.Table; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; import com.baidu.palo.common.InternalException; -import com.baidu.palo.analysis.AnalyticExpr; import com.baidu.palo.rewrite.ExprRewriter; import com.google.common.base.Joiner; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; - import com.google.common.collect.Sets; -import org.apache.logging.log4j.Logger; + import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Set; /** * Superclass of all table references, including references to views, base tables @@ -534,6 +533,9 @@ public class TableRef implements ParseNode { } StringBuilder output = new StringBuilder(" " + joinOpToSql() + " "); + if (joinHints != null && !joinHints.isEmpty()) { + output.append("[").append(Joiner.on(", ").join(joinHints)).append("] "); + } output.append(tableRefToSql()).append(" "); if (usingColNames != null) { output.append("USING (").append(Joiner.on(", ").join(usingColNames)).append(")"); diff --git a/fe/src/com/baidu/palo/analysis/TupleIsNullPredicate.java b/fe/src/com/baidu/palo/analysis/TupleIsNullPredicate.java index 0350d51db5..84fc5196b4 100644 --- a/fe/src/com/baidu/palo/analysis/TupleIsNullPredicate.java +++ b/fe/src/com/baidu/palo/analysis/TupleIsNullPredicate.java @@ -38,17 +38,17 @@ import java.util.List; */ public class TupleIsNullPredicate extends Predicate { - private final List tupleIds; + private final List tupleIds = Lists.newArrayList(); public TupleIsNullPredicate(List tupleIds) { Preconditions.checkState(tupleIds != null && !tupleIds.isEmpty()); - this.tupleIds = tupleIds; + this.tupleIds.addAll(tupleIds); } protected TupleIsNullPredicate(TupleIsNullPredicate other) { 
super(other); - tupleIds = other.tupleIds; - } + tupleIds.addAll(other.tupleIds); + } @Override protected void analyzeImpl(Analyzer analyzer) throws AnalysisException { diff --git a/fe/src/com/baidu/palo/catalog/AggregateFunction.java b/fe/src/com/baidu/palo/catalog/AggregateFunction.java index c4dd5c832a..96ca78866f 100644 --- a/fe/src/com/baidu/palo/catalog/AggregateFunction.java +++ b/fe/src/com/baidu/palo/catalog/AggregateFunction.java @@ -217,6 +217,7 @@ public class AggregateFunction extends Function { public TFunction toThrift() { TFunction fn = super.toThrift(); TAggregateFunction aggFn = new TAggregateFunction(); + aggFn.setIs_analytic_only_fn(isAnalyticFn && !isAggregateFn); aggFn.setUpdate_fn_symbol(updateFnSymbol); aggFn.setInit_fn_symbol(initFnSymbol); if (serializeFnSymbol != null) { diff --git a/fe/src/com/baidu/palo/catalog/Catalog.java b/fe/src/com/baidu/palo/catalog/Catalog.java index 9c9865611d..ef0c20e3f6 100644 --- a/fe/src/com/baidu/palo/catalog/Catalog.java +++ b/fe/src/com/baidu/palo/catalog/Catalog.java @@ -272,6 +272,8 @@ public class Catalog { private Checkpoint checkpointer; private Pair helperNode = null; private Pair selfNode = null; + private Pair selfHostname = null; + private List frontends; private List removedFrontends; @@ -407,7 +409,7 @@ public class Catalog { return CHECKPOINT; } - private static Catalog getCurrentCatalog() { + public static Catalog getCurrentCatalog() { if (isCheckpointThread()) { return CHECKPOINT; } else { @@ -691,6 +693,8 @@ public class Catalog { private void getSelfHostPort() { selfNode = new Pair(FrontendOptions.getLocalHostAddress(), Config.edit_log_port); + selfHostname = new Pair(FrontendOptions.getHostname(), Config.edit_log_port); + LOG.debug("get self node: {}, self hostname: {}", selfNode, selfHostname); } private void getHelperNode(String[] args) throws AnalysisException { @@ -803,6 +807,16 @@ public class Catalog { System.out.println(msg); LOG.info(msg); + // MUST set master ip before starting 
checkpoint thread. + // because checkpoint thread need this info to select non-master FE to push image + this.masterIp = FrontendOptions.getLocalHostAddress(); + this.masterRpcPort = Config.rpc_port; + this.masterHttpPort = Config.http_port; + + MasterInfo info = new MasterInfo(this.masterIp, this.masterHttpPort, this.masterRpcPort); + editLog.logMasterInfo(info); + + // start checkpoint thread checkpointer = new Checkpoint(editLog); checkpointer.setName("leaderCheckpointer"); checkpointer.setInterval(FeConstants.checkpoint_interval_second * 1000L); @@ -842,16 +856,6 @@ public class Catalog { // catalog recycle bin getRecycleBin().start(); - this.masterIp = FrontendOptions.getLocalHostAddress(); - this.masterRpcPort = Config.rpc_port; - this.masterHttpPort = Config.http_port; - - MasterInfo info = new MasterInfo(); - info.setIp(masterIp); - info.setRpcPort(masterRpcPort); - info.setHttpPort(masterHttpPort); - editLog.logMasterInfo(info); - createTimePrinter(); timePrinter.setName("timePrinter"); long tsInterval = (long) ((Config.meta_delay_toleration_second / 2.0) * 1000L); @@ -1861,6 +1865,7 @@ public class Catalog { switch (feType) { case UNKNOWN: { transferToNonMaster(); + break; } default: } @@ -3023,18 +3028,6 @@ public class Catalog { throw new DdlException(e.getMessage()); } - // check storage type if has null column - boolean hasNullColumn = false; - for (Column column : baseSchema) { - if (column.isAllowNull()) { - hasNullColumn = true; - break; - } - } - if (hasNullColumn && baseIndexStorageType != TStorageType.COLUMN) { - throw new DdlException("Only column table support null columns"); - } - Preconditions.checkNotNull(baseIndexStorageType); long baseIndexId = olapTable.getId(); olapTable.setStorageTypeToIndex(baseIndexId, baseIndexStorageType); @@ -3044,9 +3037,6 @@ public class Catalog { double bfFpp = 0; try { bfColumns = PropertyAnalyzer.analyzeBloomFilterColumns(properties, baseSchema); - if (bfColumns != null && baseIndexStorageType == 
TStorageType.ROW) { - throw new DdlException("Only column table support bloom filter index"); - } if (bfColumns != null && bfColumns.isEmpty()) { bfColumns = null; } @@ -4078,6 +4068,10 @@ public class Catalog { return this.selfNode; } + public Pair getSelfHostname() { + return this.selfHostname; + } + public FrontendNodeType getFeType() { return this.feType; } diff --git a/fe/src/com/baidu/palo/catalog/DomainResolverServer.java b/fe/src/com/baidu/palo/catalog/DomainResolverServer.java index ca52631fcc..085c275035 100644 --- a/fe/src/com/baidu/palo/catalog/DomainResolverServer.java +++ b/fe/src/com/baidu/palo/catalog/DomainResolverServer.java @@ -65,9 +65,11 @@ public final class DomainResolverServer { } public static DomainResolverServer getInstance() { - synchronized (DomainResolverServer.class) { - if (instance == null) { - instance = new DomainResolverServer(); + if (instance == null) { + synchronized (DomainResolverServer.class) { + if (instance == null) { + instance = new DomainResolverServer(); + } } } return instance; @@ -383,7 +385,7 @@ public final class DomainResolverServer { cloneLock.unlock(); return copyMaps; } - + // Resolve domain name at intervals, when new domain name are registered // calling register() , server will immediately start a new asynchronous // resolvation. 
diff --git a/fe/src/com/baidu/palo/catalog/FunctionSet.java b/fe/src/com/baidu/palo/catalog/FunctionSet.java index 72c2b0f45e..af4f4903a9 100644 --- a/fe/src/com/baidu/palo/catalog/FunctionSet.java +++ b/fe/src/com/baidu/palo/catalog/FunctionSet.java @@ -127,7 +127,122 @@ public class FunctionSet { "3maxIN8palo_udf10DecimalValEEEvPNS2_15FunctionContextERKT_PS6_") .put(Type.LARGEINT, "3maxIN8palo_udf11LargeIntValEEEvPNS2_15FunctionContextERKT_PS6_") - .build(); + .build(); + + private static final Map MULTI_DISTINCT_SUM_RETURN_TYPE = + ImmutableMap.builder() + .put(Type.TINYINT, Type.BIGINT) + .put(Type.SMALLINT, Type.BIGINT) + .put(Type.INT, Type.BIGINT) + .put(Type.BIGINT, Type.BIGINT) + .put(Type.FLOAT, Type.DOUBLE) + .put(Type.DOUBLE, Type.DOUBLE) + .put(Type.LARGEINT, Type.LARGEINT) + .put(Type.DECIMAL, Type.DECIMAL) + .build(); + + private static final Map MULTI_DISTINCT_INIT_SYMBOL = + ImmutableMap.builder() + .put(Type.TINYINT, + "34count_or_sum_distinct_numeric_initIN8palo_udf10TinyIntValEEEvPNS2_15FunctionContextEPNS2_9StringValE") + .put(Type.SMALLINT, + "34count_or_sum_distinct_numeric_initIN8palo_udf11SmallIntValEEEvPNS2_15FunctionContextEPNS2_9StringValE") + .put(Type.INT, + "34count_or_sum_distinct_numeric_initIN8palo_udf6IntValEEEvPNS2_15FunctionContextEPNS2_9StringValE") + .put(Type.BIGINT, + "34count_or_sum_distinct_numeric_initIN8palo_udf9BigIntValEEEvPNS2_15FunctionContextEPNS2_9StringValE") + .put(Type.FLOAT, + "34count_or_sum_distinct_numeric_initIN8palo_udf8FloatValEEEvPNS2_15FunctionContextEPNS2_9StringValE") + .put(Type.DOUBLE, + "34count_or_sum_distinct_numeric_initIN8palo_udf9DoubleValEEEvPNS2_15FunctionContextEPNS2_9StringValE") + .put(Type.LARGEINT, + "34count_or_sum_distinct_numeric_initIN8palo_udf11LargeIntValEEEvPNS2_15FunctionContextEPNS2_9StringValE") + .build(); + + private static final Map MULTI_DISTINCT_UPDATE_SYMBOL = + ImmutableMap.builder() + .put(Type.TINYINT, + 
"36count_or_sum_distinct_numeric_updateIN8palo_udf10TinyIntValEEEvPNS2_15FunctionContextERT_PNS2_9StringValE") + .put(Type.SMALLINT, + "36count_or_sum_distinct_numeric_updateIN8palo_udf11SmallIntValEEEvPNS2_15FunctionContextERT_PNS2_9StringValE") + .put(Type.INT, + "36count_or_sum_distinct_numeric_updateIN8palo_udf6IntValEEEvPNS2_15FunctionContextERT_PNS2_9StringValE") + .put(Type.BIGINT, + "36count_or_sum_distinct_numeric_updateIN8palo_udf9BigIntValEEEvPNS2_15FunctionContextERT_PNS2_9StringValE") + .put(Type.FLOAT, + "36count_or_sum_distinct_numeric_updateIN8palo_udf8FloatValEEEvPNS2_15FunctionContextERT_PNS2_9StringValE") + .put(Type.DOUBLE, + "36count_or_sum_distinct_numeric_updateIN8palo_udf9DoubleValEEEvPNS2_15FunctionContextERT_PNS2_9StringValE") + .put(Type.LARGEINT, + "36count_or_sum_distinct_numeric_updateIN8palo_udf11LargeIntValEEEvPNS2_15FunctionContextERT_PNS2_9StringValE") + .build(); + + private static final Map MULTI_DISTINCT_MERGE_SYMBOL = + ImmutableMap.builder() + .put(Type.TINYINT, + "35count_or_sum_distinct_numeric_mergeIN8palo_udf10TinyIntValEEEvPNS2_15FunctionContextERNS2_9StringValEPS6_") + .put(Type.SMALLINT, + "35count_or_sum_distinct_numeric_mergeIN8palo_udf11SmallIntValEEEvPNS2_15FunctionContextERNS2_9StringValEPS6_") + .put(Type.INT, + "35count_or_sum_distinct_numeric_mergeIN8palo_udf6IntValEEEvPNS2_15FunctionContextERNS2_9StringValEPS6_") + .put(Type.BIGINT, + "35count_or_sum_distinct_numeric_mergeIN8palo_udf9BigIntValEEEvPNS2_15FunctionContextERNS2_9StringValEPS6_") + .put(Type.FLOAT, + "35count_or_sum_distinct_numeric_mergeIN8palo_udf8FloatValEEEvPNS2_15FunctionContextERNS2_9StringValEPS6_") + .put(Type.DOUBLE, + "35count_or_sum_distinct_numeric_mergeIN8palo_udf9DoubleValEEEvPNS2_15FunctionContextERNS2_9StringValEPS6_") + .put(Type.LARGEINT, + "35count_or_sum_distinct_numeric_mergeIN8palo_udf11LargeIntValEEEvPNS2_15FunctionContextERNS2_9StringValEPS6_") + .build(); + + private static final Map MULTI_DISTINCT_SERIALIZE_SYMBOL = + 
ImmutableMap.builder() + .put(Type.TINYINT, + "39count_or_sum_distinct_numeric_serializeIN8palo_udf10TinyIntValEEENS2_9StringValEPNS2_15FunctionContextERKS4_") + .put(Type.SMALLINT, + "39count_or_sum_distinct_numeric_serializeIN8palo_udf11SmallIntValEEENS2_9StringValEPNS2_15FunctionContextERKS4_") + .put(Type.INT, + "39count_or_sum_distinct_numeric_serializeIN8palo_udf6IntValEEENS2_9StringValEPNS2_15FunctionContextERKS4_") + .put(Type.BIGINT, + "39count_or_sum_distinct_numeric_serializeIN8palo_udf9BigIntValEEENS2_9StringValEPNS2_15FunctionContextERKS4_") + .put(Type.FLOAT, + "39count_or_sum_distinct_numeric_serializeIN8palo_udf8FloatValEEENS2_9StringValEPNS2_15FunctionContextERKS4_") + .put(Type.DOUBLE, + "39count_or_sum_distinct_numeric_serializeIN8palo_udf9DoubleValEEENS2_9StringValEPNS2_15FunctionContextERKS4_") + .put(Type.LARGEINT, + "39count_or_sum_distinct_numeric_serializeIN8palo_udf11LargeIntValEEENS2_9StringValEPNS2_15FunctionContextERKS4_") + .build(); + + private static final Map MULTI_DISTINCT_COUNT_FINALIZE_SYMBOL = + ImmutableMap.builder() + .put(Type.TINYINT, + "38count_or_sum_distinct_numeric_finalizeIN8palo_udf10TinyIntValEEENS2_9BigIntValEPNS2_15FunctionContextERKNS2_9StringValE") + .put(Type.SMALLINT, + "38count_or_sum_distinct_numeric_finalizeIN8palo_udf11SmallIntValEEENS2_9BigIntValEPNS2_15FunctionContextERKNS2_9StringValE") + .put(Type.INT, + "38count_or_sum_distinct_numeric_finalizeIN8palo_udf8FloatValEEENS2_9BigIntValEPNS2_15FunctionContextERKNS2_9StringValE") + .put(Type.BIGINT, + "38count_or_sum_distinct_numeric_finalizeIN8palo_udf9BigIntValEEES3_PNS2_15FunctionContextERKNS2_9StringValE") + .put(Type.FLOAT, + "38count_or_sum_distinct_numeric_finalizeIN8palo_udf8FloatValEEENS2_9BigIntValEPNS2_15FunctionContextERKNS2_9StringValE") + .put(Type.DOUBLE, + "38count_or_sum_distinct_numeric_finalizeIN8palo_udf9DoubleValEEENS2_9BigIntValEPNS2_15FunctionContextERKNS2_9StringValE") + .put(Type.LARGEINT, + 
"38count_or_sum_distinct_numeric_finalizeIN8palo_udf11LargeIntValEEENS2_9BigIntValEPNS2_15FunctionContextERKNS2_9StringValE") + .build(); + + + private static final Map MULTI_DISTINCT_SUM_FINALIZE_SYMBOL = + ImmutableMap.builder() + .put(Type.BIGINT, + "28sum_distinct_bigint_finalizeIN8palo_udf9BigIntValEEES3_PNS2_15FunctionContextERKNS2_9StringValE") + .put(Type.FLOAT, + "28sum_distinct_double_finalizeIN8palo_udf9DoubleValEEES3_PNS2_15FunctionContextERKNS2_9StringValE") + .put(Type.DOUBLE, + "28sum_distinct_double_finalizeIN8palo_udf9DoubleValEEES3_PNS2_15FunctionContextERKNS2_9StringValE") + .put(Type.LARGEINT, + "30sum_distinct_largeint_finalizeIN8palo_udf11LargeIntValEEES3_PNS2_15FunctionContextERKNS2_9StringValE") + .build(); private static final Map STDDEV_UPDATE_SYMBOL = ImmutableMap.builder() @@ -536,6 +651,86 @@ public class FunctionSet { prefix + "12count_removeEPN8palo_udf15FunctionContextERKNS1_6AnyValEPNS1_9BigIntValE", null, false, true, true)); + + // count in multi distinct + if (t == Type.CHAR || t == Type.VARCHAR) { + addBuiltin(AggregateFunction.createBuiltin("multi_distinct_count", Lists.newArrayList(t), + Type.BIGINT, + Type.VARCHAR, + prefix + "26count_distinct_string_initEPN8palo_udf15FunctionContextEPNS1_9StringValE", + prefix + "28count_distinct_string_updateEPN8palo_udf15FunctionContextERNS1_9StringValEPS4_", + prefix + "27count_distinct_string_mergeEPN8palo_udf15FunctionContextERNS1_9StringValEPS4_", + prefix + "31count_distinct_string_serializeEPN8palo_udf15FunctionContextERKNS1_9StringValE", + null, + null, + prefix + "30count_distinct_string_finalizeEPN8palo_udf15FunctionContextERKNS1_9StringValE", + false, true, true)); + + } else if (t == Type.TINYINT || t == Type.SMALLINT || t == Type.INT + || t == Type.BIGINT || t == Type.LARGEINT || t == Type.DOUBLE) { + addBuiltin(AggregateFunction.createBuiltin("multi_distinct_count", Lists.newArrayList(t), + Type.BIGINT, + Type.VARCHAR, + prefix + MULTI_DISTINCT_INIT_SYMBOL.get(t), + prefix + 
MULTI_DISTINCT_UPDATE_SYMBOL.get(t), + prefix + MULTI_DISTINCT_MERGE_SYMBOL.get(t), + prefix + MULTI_DISTINCT_SERIALIZE_SYMBOL.get(t), + null, + null, + prefix + MULTI_DISTINCT_COUNT_FINALIZE_SYMBOL.get(t), + false, true, true)); + } else if (t == Type.DATE || t == Type.DATETIME) { + addBuiltin(AggregateFunction.createBuiltin("multi_distinct_count", Lists.newArrayList(t), + Type.BIGINT, + Type.VARCHAR, + prefix + "24count_distinct_date_initEPN8palo_udf15FunctionContextEPNS1_9StringValE", + prefix + "26count_distinct_date_updateEPN8palo_udf15FunctionContextERNS1_11DateTimeValEPNS1_9StringValE", + prefix + "25count_distinct_date_mergeEPN8palo_udf15FunctionContextERNS1_9StringValEPS4_", + prefix + "29count_distinct_date_serializeEPN8palo_udf15FunctionContextERKNS1_9StringValE", + null, + null, + prefix + "28count_distinct_date_finalizeEPN8palo_udf15FunctionContextERKNS1_9StringValE", + false, true, true)); + } else if (t == Type.DECIMAL) { + addBuiltin(AggregateFunction.createBuiltin("multi_distinct_count", Lists.newArrayList(t), + Type.BIGINT, + Type.VARCHAR, + prefix + "34count_or_sum_distinct_decimal_initEPN8palo_udf15FunctionContextEPNS1_9StringValE", + prefix + "36count_or_sum_distinct_decimal_updateEPN8palo_udf15FunctionContextERNS1_10DecimalValEPNS1_9StringValE", + prefix + "35count_or_sum_distinct_decimal_mergeEPN8palo_udf15FunctionContextERNS1_9StringValEPS4_", + prefix + "39count_or_sum_distinct_decimal_serializeEPN8palo_udf15FunctionContextERKNS1_9StringValE", + null, + null, + prefix + "31count_distinct_decimal_finalizeEPN8palo_udf15FunctionContextERKNS1_9StringValE", + false, true, true)); + } + + // sum in multi distinct + if (t == Type.BIGINT || t == Type.LARGEINT || t == Type.DOUBLE) { + addBuiltin(AggregateFunction.createBuiltin("multi_distinct_sum", Lists.newArrayList(t), + t, + Type.VARCHAR, + prefix + MULTI_DISTINCT_INIT_SYMBOL.get(t), + prefix + MULTI_DISTINCT_UPDATE_SYMBOL.get(t), + prefix + MULTI_DISTINCT_MERGE_SYMBOL.get(t), + prefix + 
MULTI_DISTINCT_SERIALIZE_SYMBOL.get(t), + null, + null, + prefix + MULTI_DISTINCT_SUM_FINALIZE_SYMBOL.get(t), + false, true, true)); + } else if (t == Type.DECIMAL) { + addBuiltin(AggregateFunction.createBuiltin("multi_distinct_sum", Lists.newArrayList(t), + MULTI_DISTINCT_SUM_RETURN_TYPE.get(t), + Type.VARCHAR, + prefix + "34count_or_sum_distinct_decimal_initEPN8palo_udf15FunctionContextEPNS1_9StringValE", + prefix + "36count_or_sum_distinct_decimal_updateEPN8palo_udf15FunctionContextERNS1_10DecimalValEPNS1_9StringValE", + prefix + "35count_or_sum_distinct_decimal_mergeEPN8palo_udf15FunctionContextERNS1_9StringValEPS4_", + prefix + "39count_or_sum_distinct_decimal_serializeEPN8palo_udf15FunctionContextERKNS1_9StringValE", + null, + null, + prefix + "29sum_distinct_decimal_finalizeEPN8palo_udf15FunctionContextERKNS1_9StringValE", + false, true, true)); + } // Min String minMaxInit = t.isStringType() ? initNullString : initNull; String minMaxSerializeOrFinalize = t.isStringType() ? stringValSerializeOrFinalize : null; diff --git a/fe/src/com/baidu/palo/catalog/OlapTable.java b/fe/src/com/baidu/palo/catalog/OlapTable.java index e099eb174c..87faba8e83 100644 --- a/fe/src/com/baidu/palo/catalog/OlapTable.java +++ b/fe/src/com/baidu/palo/catalog/OlapTable.java @@ -185,7 +185,12 @@ public class OlapTable extends Table { indexIdToSchema.put(indexId, schema); indexIdToSchemaVersion.put(indexId, schemaVersion); indexIdToSchemaHash.put(indexId, schemaHash); - indexIdToShortKeyColumnCount.put(indexId, shortKeyColumnCount); + indexIdToShortKeyColumnCount.put(indexId, shortKeyColumnCount); + } + + public void setIndexStorageType(Long indexId, TStorageType newStorageType) { + Preconditions.checkState(newStorageType == TStorageType.COLUMN); + indexIdToStorageType.put(indexId, newStorageType); } public void deleteIndexInfo(String indexName) { @@ -375,14 +380,6 @@ public class OlapTable extends Table { } public Partition getPartition(String partitionName) { - if 
(!nameToPartition.containsKey(partitionName)) { - LOG.info("partition size: {}", nameToPartition.size()); - for (Map.Entry entry : nameToPartition.entrySet()) { - LOG.info("print partition id: {}, key name: {}, partition name: {}", - entry.getValue().getId(), entry.getKey(), entry.getValue().getName()); - - } - } return nameToPartition.get(partitionName); } diff --git a/fe/src/com/baidu/palo/clone/Clone.java b/fe/src/com/baidu/palo/clone/Clone.java index 9e4ebe366f..15441d00e2 100644 --- a/fe/src/com/baidu/palo/clone/Clone.java +++ b/fe/src/com/baidu/palo/clone/Clone.java @@ -559,6 +559,10 @@ public class Clone { } Replica replica = tablet.getReplicaByBackendId(backendId); + if (replica == null) { + throw new MetaNotFoundException("replica does not exist in be: " + backendId + + " . tablet id: " + tabletId); + } if (replica.getState() == ReplicaState.CLONE) { if (tablet.deleteReplicaByBackendId(backendId)) { LOG.info("remove clone replica. tablet id: {}, backend id: {}", tabletId, backendId); diff --git a/fe/src/com/baidu/palo/common/CommandLineOptions.java b/fe/src/com/baidu/palo/common/CommandLineOptions.java new file mode 100644 index 0000000000..4c56c4983c --- /dev/null +++ b/fe/src/com/baidu/palo/common/CommandLineOptions.java @@ -0,0 +1,48 @@ +package com.baidu.palo.common; + +import com.baidu.palo.journal.bdbje.BDBToolOptions; + +public class CommandLineOptions { + + private boolean isVersion; + private String helperNode; + private boolean runBdbTools; + private BDBToolOptions bdbToolOpts = null; + + public CommandLineOptions(boolean isVersion, String helperNode, BDBToolOptions bdbToolOptions) { + this.isVersion = isVersion; + this.helperNode = helperNode; + this.bdbToolOpts = bdbToolOptions; + if (this.bdbToolOpts != null) { + runBdbTools = true; + } else { + runBdbTools = false; + } + } + + public boolean isVersion() { + return isVersion; + } + + public String getHelperNode() { + return helperNode; + } + + public boolean runBdbTools() { + return 
runBdbTools; + } + + public BDBToolOptions getBdbToolOpts() { + return bdbToolOpts; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("print version: " + isVersion).append("\n"); + sb.append("helper node: " + helperNode).append("\n"); + sb.append("bdb tool options: " + bdbToolOpts).append("\n"); + return sb.toString(); + } + +} diff --git a/fe/src/com/baidu/palo/common/Config.java b/fe/src/com/baidu/palo/common/Config.java index 55d3f12df9..26835db800 100644 --- a/fe/src/com/baidu/palo/common/Config.java +++ b/fe/src/com/baidu/palo/common/Config.java @@ -55,8 +55,7 @@ public class Config extends ConfigBase { */ @ConfField public static String[] audit_log_modules = {"slow_query", "query"}; @ConfField public static String audit_log_roll_mode = "TIME-DAY"; // TIME-DAY, TIME-HOUR, SIZE-MB-nnn - @ConfField - public static int audit_log_roll_num = 10; // Doesn't work if roll mode is TIME-* + @ConfField public static int audit_log_roll_num = 10; // Doesn't work if roll mode is TIME-* /* * Labels of finished or cancelled load jobs will be removed after *label_keep_max_second* @@ -485,7 +484,7 @@ public class Config extends ConfigBase { // for forward compatibility, will be removed later. // check token when download image file. @ConfField public static boolean enable_token_check = true; - + /* * Set to true if you deploy Palo using thirdparty deploy manager * Valid options are: diff --git a/fe/src/com/baidu/palo/common/FeConstants.java b/fe/src/com/baidu/palo/common/FeConstants.java index 06a57f0d3f..0a86cc4159 100644 --- a/fe/src/com/baidu/palo/common/FeConstants.java +++ b/fe/src/com/baidu/palo/common/FeConstants.java @@ -38,5 +38,5 @@ public class FeConstants { // general model // Current meta data version. 
Use this version to write journals and image - public static int meta_version = FeMetaVersion.VERSION_37; + public static int meta_version = FeMetaVersion.VERSION_40; } diff --git a/fe/src/com/baidu/palo/common/FeMetaVersion.java b/fe/src/com/baidu/palo/common/FeMetaVersion.java index 46a6e83afe..29602ba54e 100644 --- a/fe/src/com/baidu/palo/common/FeMetaVersion.java +++ b/fe/src/com/baidu/palo/common/FeMetaVersion.java @@ -79,4 +79,13 @@ public final class FeMetaVersion { // added collation_server to variables (palo-3059) public static final int VERSION_37 = 37; + + // paralle exec param and batch size + public static final int VERSION_38 = 38; + + // schema change support row to column + public static final int VERSION_39 = 39; + + // persistent brpc port in Backend + public static final int VERSION_40 = 40; } diff --git a/fe/src/com/baidu/palo/common/proc/BackendsProcDir.java b/fe/src/com/baidu/palo/common/proc/BackendsProcDir.java index caefa5ce3f..f107af01d3 100644 --- a/fe/src/com/baidu/palo/common/proc/BackendsProcDir.java +++ b/fe/src/com/baidu/palo/common/proc/BackendsProcDir.java @@ -18,8 +18,8 @@ // specific language governing permissions and limitations // under the License. 
-package com.baidu.palo.common.proc; - +package com.baidu.palo.common.proc; + import com.baidu.palo.alter.DecommissionBackendJob.DecommissionType; import com.baidu.palo.catalog.Catalog; import com.baidu.palo.cluster.Cluster; @@ -39,23 +39,23 @@ import java.net.UnknownHostException; import java.util.ArrayList; import java.util.Collections; import java.util.LinkedList; -import java.util.List; +import java.util.List; public class BackendsProcDir implements ProcDirInterface { public static final ImmutableList TITLE_NAMES = new ImmutableList.Builder() .add("BackendId").add("Cluster").add("IP").add("HostName").add("HeartbeatPort") - .add("BePort").add("HttpPort").add("LastStartTime").add("LastHeartbeat").add("Alive") + .add("BePort").add("HttpPort").add("brpcPort").add("LastStartTime").add("LastHeartbeat").add("Alive") .add("SystemDecommissioned").add("ClusterDecommissioned").add("TabletNum").add("FreeSpace") .build(); public static final int IP_INDEX = 2; - public static final int HOSTNAME_INDEX = 3; - - private SystemInfoService clusterInfoService; - - public BackendsProcDir(SystemInfoService clusterInfoService) { - this.clusterInfoService = clusterInfoService; - } + public static final int HOSTNAME_INDEX = 3; + + private SystemInfoService clusterInfoService; + + public BackendsProcDir(SystemInfoService clusterInfoService) { + this.clusterInfoService = clusterInfoService; + } @Override public ProcResult fetchResult() throws AnalysisException { @@ -131,6 +131,7 @@ public class BackendsProcDir implements ProcDirInterface { backendInfo.add(String.valueOf(backend.getHeartbeatPort())); backendInfo.add(String.valueOf(backend.getBePort())); backendInfo.add(String.valueOf(backend.getHttpPort())); + backendInfo.add(String.valueOf(backend.getBrpcPort())); } backendInfo.add(TimeUtils.longToTimeString(backend.getLastStartTime())); backendInfo.add(TimeUtils.longToTimeString(backend.getLastUpdateMs())); @@ -174,30 +175,30 @@ public class BackendsProcDir implements ProcDirInterface { 
return backendInfos; } - @Override - public boolean register(String name, ProcNodeInterface node) { - return false; - } - - @Override - public ProcNodeInterface lookup(String beIdStr) throws AnalysisException { - if (Strings.isNullOrEmpty(beIdStr)) { - throw new AnalysisException("Backend id is null"); - } - - long backendId = -1L; - try { - backendId = Long.valueOf(beIdStr); - } catch (NumberFormatException e) { - throw new AnalysisException("Invalid backend id format: " + beIdStr); - } - - Backend backend = clusterInfoService.getBackend(backendId); - if (backend == null) { - throw new AnalysisException("Backend[" + backendId + "] does not exist."); - } - - return new BackendProcNode(backend); - } - -} + @Override + public boolean register(String name, ProcNodeInterface node) { + return false; + } + + @Override + public ProcNodeInterface lookup(String beIdStr) throws AnalysisException { + if (Strings.isNullOrEmpty(beIdStr)) { + throw new AnalysisException("Backend id is null"); + } + + long backendId = -1L; + try { + backendId = Long.valueOf(beIdStr); + } catch (NumberFormatException e) { + throw new AnalysisException("Invalid backend id format: " + beIdStr); + } + + Backend backend = clusterInfoService.getBackend(backendId); + if (backend == null) { + throw new AnalysisException("Backend[" + backendId + "] does not exist."); + } + + return new BackendProcNode(backend); + } + +} diff --git a/fe/src/com/baidu/palo/common/util/PropertyAnalyzer.java b/fe/src/com/baidu/palo/common/util/PropertyAnalyzer.java index f14828b25e..dbf934deba 100644 --- a/fe/src/com/baidu/palo/common/util/PropertyAnalyzer.java +++ b/fe/src/com/baidu/palo/common/util/PropertyAnalyzer.java @@ -190,7 +190,7 @@ public class PropertyAnalyzer { } public static TStorageType analyzeStorageType(Map properties) throws AnalysisException { - // only COLUMN is allowed now + // default is COLUMN TStorageType tStorageType = TStorageType.COLUMN; if (properties != null && 
properties.containsKey(PROPERTIES_STORAGE_TYPE)) { String storageType = properties.get(PROPERTIES_STORAGE_TYPE); diff --git a/fe/src/com/baidu/palo/common/util/Util.java b/fe/src/com/baidu/palo/common/util/Util.java index 3686bfcf0c..086c18a1a7 100644 --- a/fe/src/com/baidu/palo/common/util/Util.java +++ b/fe/src/com/baidu/palo/common/util/Util.java @@ -22,6 +22,7 @@ package com.baidu.palo.common.util; import com.baidu.palo.catalog.Column; import com.baidu.palo.catalog.PrimitiveType; +import com.baidu.palo.common.Config; import com.google.common.collect.Lists; @@ -132,7 +133,7 @@ public class Util { CommandResult result = new CommandResult(); List cmdList = shellSplit(cmd); String[] cmds = cmdList.toArray(new String[0]); - + try { Process p = Runtime.getRuntime().exec(cmds, envp); CmdWorker cmdWorker = new CmdWorker(p); diff --git a/fe/src/com/baidu/palo/deploy/DeployManager.java b/fe/src/com/baidu/palo/deploy/DeployManager.java index e5a54d959e..de26eb471e 100644 --- a/fe/src/com/baidu/palo/deploy/DeployManager.java +++ b/fe/src/com/baidu/palo/deploy/DeployManager.java @@ -115,7 +115,7 @@ public class DeployManager extends Daemon { protected static final Integer MAX_MISSING_TIME = 3; public DeployManager(Catalog catalog, long intervalMs) { - super("deplotManager", intervalMs); + super("deployManager", intervalMs); this.catalog = catalog; } @@ -130,10 +130,10 @@ public class DeployManager extends Daemon { this.backendServiceGroup = Strings.nullToEmpty(System.getenv(envBackendServiceGroup)); this.brokerServiceGroup = Strings.nullToEmpty(System.getenv(envBrokerServiceGroup)); - LOG.info("get ambari env: {}, {}, {}, {}", envElectableFeServiceGroup, envObserverFeServiceGroup, + LOG.info("get deploy env: {}, {}, {}, {}", envElectableFeServiceGroup, envObserverFeServiceGroup, envBackendServiceGroup, envBrokerServiceGroup); - // electableFeServiceGroup and backendServiceGroup + // electableFeServiceGroup and backendServiceGroup must exist if 
(Strings.isNullOrEmpty(electableFeServiceGroup) || Strings.isNullOrEmpty(backendServiceGroup)) { LOG.warn("failed to init service group name." + " electableFeServiceGroup: {}, backendServiceGroup: {}", @@ -320,7 +320,7 @@ public class DeployManager extends Daemon { if (selfHost == null) { // The running of this deploy manager means this node is considered self as Master. // If it self does not exist in electable fe service group, it should shut it self down. - LOG.warn("Self host {} is not in electable fe service group {}. Exit now.", + LOG.warn("self host {} is not in electable fe service group {}. Exit now.", selfHost, electableFeServiceGroup); System.exit(-1); } @@ -329,7 +329,9 @@ public class DeployManager extends Daemon { List localElectableFeAddrs = catalog.getFrontends(FrontendNodeType.FOLLOWER); List> localElectableFeHosts = convertToHostPortPair(localElectableFeAddrs); LOG.debug("get local electable hosts: {}", localElectableFeHosts); - inspectNodeChange(remoteElectableFeHosts, localElectableFeHosts, NodeType.ELECTABLE); + if (inspectNodeChange(remoteElectableFeHosts, localElectableFeHosts, NodeType.ELECTABLE)) { + return; + } // 2. 
Check the backend service group BE_BLOCK: { @@ -344,7 +346,9 @@ public class DeployManager extends Daemon { localBackendHosts.add(Pair.create(backend.getHost(), backend.getHeartbeatPort())); } LOG.debug("get local backend addrs: {}", localBackendHosts); - inspectNodeChange(remoteBackendHosts, localBackendHosts, NodeType.BACKEND); + if (inspectNodeChange(remoteBackendHosts, localBackendHosts, NodeType.BACKEND)) { + return; + } } if (hasObserverService) { @@ -358,7 +362,9 @@ public class DeployManager extends Daemon { List localObserverFeAddrs = catalog.getFrontends(FrontendNodeType.OBSERVER); List> localObserverFeHosts = convertToHostPortPair(localObserverFeAddrs); LOG.debug("get local observer fe hosts: {}", localObserverFeHosts); - inspectNodeChange(remoteObserverFeHosts, localObserverFeHosts, NodeType.OBSERVER); + if (inspectNodeChange(remoteObserverFeHosts, localObserverFeHosts, NodeType.OBSERVER)) { + return; + } } } @@ -461,9 +467,10 @@ public class DeployManager extends Daemon { * 1. Check if there are some nodes need to be dropped. * 2. Check if there are some nodes need to be added. * - * We only handle one change at a time + * We only handle one change at a time. + * Return true if something changed */ - private void inspectNodeChange(List> remoteHosts, + private boolean inspectNodeChange(List> remoteHosts, List> localHosts, NodeType nodeType) { @@ -476,7 +483,7 @@ public class DeployManager extends Daemon { // Double check if is it self if (isSelf(localIp, localPort)) { // This is it self. Shut down now. - LOG.error("Self host {}:{} does not exist in remote hosts. Showdown."); + LOG.error("self host {}:{} does not exist in remote hosts. 
Showdown."); System.exit(-1); } @@ -486,14 +493,14 @@ public class DeployManager extends Daemon { LOG.warn("downtime of {} node: {} detected times: 1", nodeType.name(), localHost); counterMap.put(localHost.toString(), 1); - return; + return false; } else { int times = counterMap.get(localHost.toString()); if (times < MAX_MISSING_TIME) { LOG.warn("downtime of {} node: {} detected times: {}", nodeType.name(), localHost, times + 1); counterMap.put(localHost.toString(), times + 1); - return; + return false; } else { // Reset the counter map and do the dropping operation LOG.warn("downtime of {} node: {} detected times: {}. drop it", @@ -502,6 +509,11 @@ public class DeployManager extends Daemon { } } + if (true) { + // TODO(cmy): For now, Deploy Manager dose not handle shrinking operations + continue; + } + // Can not find local host from remote host list, // which means this node should be dropped. try { @@ -520,11 +532,12 @@ public class DeployManager extends Daemon { } } catch (DdlException e) { LOG.error("Failed to drop {} node: {}:{}", nodeType, localIp, localPort, e); - return; + // return true is a conservative behavior. we do not expect any exception here. + return true; } LOG.info("Finished to drop {} node: {}:{}", nodeType, localIp, localPort); - return; + return true; } } @@ -554,13 +567,15 @@ public class DeployManager extends Daemon { } } catch (DdlException e) { LOG.error("Failed to add {} node: {}:{}", nodeType, remoteIp, remotePort, e); - return; + return true; } LOG.info("Finished to add {} node: {}:{}", nodeType, remoteIp, remotePort); - return; + return true; } } + + return false; } // Get host port pair from pair list. 
Return null if not found @@ -600,3 +615,4 @@ public class DeployManager extends Daemon { } } } + diff --git a/fe/src/com/baidu/palo/deploy/impl/K8sDeployManager.java b/fe/src/com/baidu/palo/deploy/impl/K8sDeployManager.java index cc10b2293f..189fe51f42 100644 --- a/fe/src/com/baidu/palo/deploy/impl/K8sDeployManager.java +++ b/fe/src/com/baidu/palo/deploy/impl/K8sDeployManager.java @@ -8,11 +8,13 @@ import com.baidu.palo.deploy.DeployManager; import com.google.common.base.Preconditions; import com.google.common.base.Strings; import com.google.common.collect.Lists; +import com.google.common.collect.Maps; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.util.List; +import java.util.Map; import io.fabric8.kubernetes.api.model.EndpointAddress; import io.fabric8.kubernetes.api.model.EndpointPort; @@ -32,13 +34,19 @@ public class K8sDeployManager extends DeployManager { public static final String ENV_FE_OBSERVER_NAMESPACE = "FE_OBSERVER_NAMESPACE"; public static final String ENV_BE_SERVICE_NAME = "BE_SERVICE_NAME"; public static final String ENV_BE_NAMESPACE = "BE_NAMESPACE"; + public static final String ENV_BROKER_SERVICE_NAME = "BROKER_SERVICE_NAME"; + public static final String ENV_BROKER_NAMESPACE = "BROKER_NAMESPACE"; + public static final String ENV_BROKER_NAME = "BROKER_NAME"; + public static final String FE_PORT = "edit-log-port"; // k8s only support -, not _ public static final String BE_PORT = "heartbeat-port"; + public static final String BROKER_PORT = "broker-port"; private String feNamespace; private String observerNamespace; private String beNamespace; + private String brokerNamespace; private KubernetesClient client = null; // =======for test only========== @@ -53,7 +61,8 @@ public class K8sDeployManager extends DeployManager { public K8sDeployManager(Catalog catalog, long intervalMs) { super(catalog, intervalMs); - initEnvVariables(ENV_FE_SERVICE_NAME, ENV_FE_OBSERVER_SERVICE_NAME, ENV_BE_SERVICE_NAME, ""); + 
initEnvVariables(ENV_FE_SERVICE_NAME, ENV_FE_OBSERVER_SERVICE_NAME, ENV_BE_SERVICE_NAME, + ENV_BROKER_SERVICE_NAME); } @Override @@ -62,23 +71,32 @@ public class K8sDeployManager extends DeployManager { super.initEnvVariables(envElectableFeServiceGroup, envObserverFeServiceGroup, envBackendServiceGroup, envBrokerServiceGroup); + // namespaces feNamespace = Strings.nullToEmpty(System.getenv(ENV_FE_NAMESPACE)); beNamespace = Strings.nullToEmpty(System.getenv(ENV_BE_NAMESPACE)); + // FE and BE namespace must exist if (Strings.isNullOrEmpty(feNamespace) || Strings.isNullOrEmpty(beNamespace)) { LOG.error("failed to init namespace. feNamespace: {}, beNamespace: {}", feNamespace, observerNamespace, beNamespace); System.exit(-1); } + // observer namespace observerNamespace = Strings.nullToEmpty(System.getenv(ENV_FE_OBSERVER_NAMESPACE)); if (Strings.isNullOrEmpty(observerNamespace)) { LOG.warn("failed to init observer namespace."); hasObserverService = false; } - LOG.info("get namespace. feNamespace: {}, observerNamespace: {}, beNamespace: {}", - feNamespace, observerNamespace, beNamespace); + brokerNamespace = Strings.nullToEmpty(System.getenv(ENV_BROKER_NAMESPACE)); + if (Strings.isNullOrEmpty(brokerNamespace)) { + LOG.warn("failed to init broker namespace."); + hasBrokerService = false; + } + + LOG.info("get namespace. 
feNamespace: {}, observerNamespace: {}, beNamespace: {}, brokerNamespace: {}", + feNamespace, observerNamespace, beNamespace, brokerNamespace); } @Override @@ -95,6 +113,9 @@ public class K8sDeployManager extends DeployManager { } else if (groupName.equals(backendServiceGroup)) { namespace = beNamespace; portName = BE_PORT; + } else if (groupName.equals(brokerServiceGroup)) { + namespace = brokerNamespace; + portName = BROKER_PORT; } else { LOG.warn("unknown service group name: {}", groupName); return null; @@ -144,6 +165,24 @@ public class K8sDeployManager extends DeployManager { return result; } + @Override + protected Map>> getBrokerGroupHostPorts() { + List> hostPorts = getGroupHostPorts(brokerServiceGroup); + if (hostPorts == null) { + return null; + } + final String brokerName = System.getenv(ENV_BROKER_NAME); + if (Strings.isNullOrEmpty(brokerName)) { + LOG.error("failed to get broker name from env: {}", ENV_BROKER_NAME); + System.exit(-1); + } + + Map>> brokers = Maps.newHashMap(); + brokers.put(brokerName, hostPorts); + LOG.info("get brokers from k8s: {}", brokers); + return brokers; + } + private Endpoints endpoints(String namespace, String serviceName) throws Exception { return client().endpoints().inNamespace(namespace).withName(serviceName).get(); } diff --git a/fe/src/com/baidu/palo/ha/MasterInfo.java b/fe/src/com/baidu/palo/ha/MasterInfo.java index 2ef842274d..1af4bbc9f0 100644 --- a/fe/src/com/baidu/palo/ha/MasterInfo.java +++ b/fe/src/com/baidu/palo/ha/MasterInfo.java @@ -13,63 +13,70 @@ // specific language governing permissions and limitations // under the License. 
-package com.baidu.palo.ha; - -import com.baidu.palo.common.io.Text; -import com.baidu.palo.common.io.Writable; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -public class MasterInfo implements Writable { - - private String ip; - private int httpPort; - private int rpcPort; - - public MasterInfo() { - this.ip = ""; - this.httpPort = 0; - this.rpcPort = 0; - } - - public String getIp() { - return this.ip; - } - - public void setIp(String ip) { - this.ip = ip; - } - - public int getHttpPort() { - return this.httpPort; - } - - public void setHttpPort(int httpPort) { - this.httpPort = httpPort; - } - - public int getRpcPort() { - return this.rpcPort; - } - - public void setRpcPort(int rpcPort) { - this.rpcPort = rpcPort; - } - - @Override - public void write(DataOutput out) throws IOException { - Text.writeString(out, ip); - out.writeInt(httpPort); - out.writeInt(rpcPort); - } - - @Override - public void readFields(DataInput in) throws IOException { - ip = Text.readString(in); - httpPort = in.readInt(); - rpcPort = in.readInt(); - } - -} +package com.baidu.palo.ha; + +import com.baidu.palo.common.io.Text; +import com.baidu.palo.common.io.Writable; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +public class MasterInfo implements Writable { + + private String ip; + private int httpPort; + private int rpcPort; + + public MasterInfo() { + this.ip = ""; + this.httpPort = 0; + this.rpcPort = 0; + } + + public MasterInfo(String ip, int httpPort, int rpcPort) { + this.ip = ip; + this.httpPort = httpPort; + this.rpcPort = rpcPort; + } + + public String getIp() { + return this.ip; + } + + public void setIp(String ip) { + this.ip = ip; + } + + public int getHttpPort() { + return this.httpPort; + } + + public void setHttpPort(int httpPort) { + this.httpPort = httpPort; + } + + public int getRpcPort() { + return this.rpcPort; + } + + public void setRpcPort(int rpcPort) { + this.rpcPort = rpcPort; + 
} + + @Override + public void write(DataOutput out) throws IOException { + Text.writeString(out, ip); + out.writeInt(httpPort); + out.writeInt(rpcPort); + } + + @Override + public void readFields(DataInput in) throws IOException { + ip = Text.readString(in); + httpPort = in.readInt(); + rpcPort = in.readInt(); + } + +} + diff --git a/fe/src/com/baidu/palo/http/BaseRequest.java b/fe/src/com/baidu/palo/http/BaseRequest.java index 49bab02450..5b92263081 100644 --- a/fe/src/com/baidu/palo/http/BaseRequest.java +++ b/fe/src/com/baidu/palo/http/BaseRequest.java @@ -18,6 +18,11 @@ package com.baidu.palo.http; import com.google.common.base.Strings; import com.google.common.collect.Maps; +import java.net.InetSocketAddress; +import java.util.List; +import java.util.Map; +import java.util.Set; + import io.netty.channel.ChannelHandlerContext; import io.netty.handler.codec.http.Cookie; import io.netty.handler.codec.http.CookieDecoder; @@ -25,11 +30,6 @@ import io.netty.handler.codec.http.HttpHeaders; import io.netty.handler.codec.http.HttpRequest; import io.netty.handler.codec.http.QueryStringDecoder; -import java.net.InetSocketAddress; -import java.util.List; -import java.util.Map; -import java.util.Set; - public class BaseRequest { protected ChannelHandlerContext context; protected HttpRequest request; @@ -140,7 +140,7 @@ public class BaseRequest { } public String getHostString() { - // get client host + // get client host InetSocketAddress clientSocket = (InetSocketAddress) context.channel().remoteAddress(); String clientIp = clientSocket.getHostString(); return clientIp; diff --git a/fe/src/com/baidu/palo/http/meta/MetaService.java b/fe/src/com/baidu/palo/http/meta/MetaService.java index 4c55e4fc7a..74e36e320d 100644 --- a/fe/src/com/baidu/palo/http/meta/MetaService.java +++ b/fe/src/com/baidu/palo/http/meta/MetaService.java @@ -266,7 +266,7 @@ public class MetaService { * the fe with the given ip and port. 
When one frontend start, it should check * the local electable_nodes config and local cluster id with other frontends. * If there is any difference, local fe will exit. This is designed to protect - * the consistance of the cluster. + * the consistency of the cluster. */ public static class CheckAction extends MetaBaseAction { private static final Logger LOG = LogManager.getLogger(CheckAction.class); @@ -275,7 +275,7 @@ public class MetaService { super(controller, imageDir); } - public static void registerAction (ActionController controller, File imageDir) + public static void registerAction(ActionController controller, File imageDir) throws IllegalArgException { controller.registerHandler(HttpMethod.GET, "/check", new CheckAction(controller, imageDir)); diff --git a/fe/src/com/baidu/palo/http/rest/AddBackendAction.java b/fe/src/com/baidu/palo/http/rest/AddBackendAction.java new file mode 100644 index 0000000000..8129954d86 --- /dev/null +++ b/fe/src/com/baidu/palo/http/rest/AddBackendAction.java @@ -0,0 +1,89 @@ +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package com.baidu.palo.http.rest; + +import com.baidu.palo.catalog.Catalog; +import com.baidu.palo.common.AnalysisException; +import com.baidu.palo.common.DdlException; +import com.baidu.palo.common.Pair; +import com.baidu.palo.http.ActionController; +import com.baidu.palo.http.BaseRequest; +import com.baidu.palo.http.BaseResponse; +import com.baidu.palo.http.IllegalArgException; +import com.baidu.palo.system.SystemInfoService; + +import com.google.common.base.Strings; +import com.google.common.collect.Lists; + +import java.util.List; + +import io.netty.handler.codec.http.HttpMethod; + +/* + * fe_host:fe_http_port/api/add_backend?host_ports=host:port,host2:port2... + * return: + * {"status":"OK","msg":"Success"} + * {"status":"FAILED","msg":"err info..."} + */ +public class AddBackendAction extends RestBaseAction { + public static final String HOST_PORTS = "host_ports"; + + public AddBackendAction(ActionController controller) { + super(controller); + } + + public static void registerAction(ActionController controller) throws IllegalArgException { + controller.registerHandler(HttpMethod.GET, "/api/add_backend", new AddBackendAction(controller)); + } + + @Override + public void execute(BaseRequest request, BaseResponse response) throws DdlException { + String hostPorts = request.getSingleParameter(HOST_PORTS); + if (Strings.isNullOrEmpty(hostPorts)) { + throw new DdlException("No host:port specified."); + } + + String[] hostPortArr = hostPorts.split(","); + if (hostPortArr.length == 0) { + throw new DdlException("No host:port specified."); + } + + if (!Catalog.getInstance().isMaster()) { + throw new DdlException("I am not master"); + } + + List> hostPortPairs = Lists.newArrayList(); + for (String hostPort : hostPortArr) { + Pair pair; + try { + pair = SystemInfoService.validateHostAndPort(hostPort); + } catch (AnalysisException e) { + throw new DdlException(e.getMessage()); + } + hostPortPairs.add(pair); + } + + 
Catalog.getCurrentSystemInfo().addBackends(hostPortPairs, false); + + // to json response + RestBaseResult result = new RestBaseResult(); + + // send result + response.setContentType("application/json"); + response.getContent().append(result.toJson()); + sendResult(request, response); + } +} diff --git a/fe/src/com/baidu/palo/http/rest/AddFrontendAction.java b/fe/src/com/baidu/palo/http/rest/AddFrontendAction.java new file mode 100644 index 0000000000..1f36e4e378 --- /dev/null +++ b/fe/src/com/baidu/palo/http/rest/AddFrontendAction.java @@ -0,0 +1,112 @@ +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package com.baidu.palo.http.rest; + +import com.baidu.palo.catalog.Catalog; +import com.baidu.palo.common.AnalysisException; +import com.baidu.palo.common.DdlException; +import com.baidu.palo.common.Pair; +import com.baidu.palo.ha.FrontendNodeType; +import com.baidu.palo.http.ActionController; +import com.baidu.palo.http.BaseRequest; +import com.baidu.palo.http.BaseResponse; +import com.baidu.palo.http.IllegalArgException; +import com.baidu.palo.system.SystemInfoService; + +import com.google.common.base.Strings; +import com.google.common.collect.Lists; + +import java.util.List; + +import io.netty.handler.codec.http.HttpMethod; + +/* + * fe_host:fe_http_port/api/add_frontend?role=follower\&host_ports=host:port,host2:port2... 
+ * fe_host:fe_http_port/api/add_frontend?role=observer\&host_ports=host:port,host2:port2... + * return: + * {"status":"OK","msg":"Success"} + * {"status":"FAILED","msg":"err info..."} + */ +public class AddFrontendAction extends RestBaseAction { + public static final String ROLE = "role"; + public static final String FOLLOWER = "follower"; + public static final String OBSERVER = "observer"; + public static final String HOST_PORTS = "host_ports"; + + public AddFrontendAction(ActionController controller) { + super(controller); + } + + public static void registerAction(ActionController controller) throws IllegalArgException { + controller.registerHandler(HttpMethod.GET, "/api/add_frontend", new AddFrontendAction(controller)); + } + + @Override + public void execute(BaseRequest request, BaseResponse response) throws DdlException { + String role = request.getSingleParameter(ROLE); + if (Strings.isNullOrEmpty(role)) { + throw new DdlException("No frontend role specified."); + } + + if (!role.equals(FOLLOWER) && !role.equals(OBSERVER)) { + throw new DdlException("frontend role must specified to follower or observer"); + } + + String hostPorts = request.getSingleParameter(HOST_PORTS); + if (Strings.isNullOrEmpty(hostPorts)) { + throw new DdlException("No host:port specified."); + } + + String[] hostPortArr = hostPorts.split(","); + if (hostPortArr.length == 0) { + throw new DdlException("No host:port specified."); + } + + if (!Catalog.getInstance().isMaster()) { + throw new DdlException("I am not master"); + } + + List> hostPortPairs = Lists.newArrayList(); + for (String hostPort : hostPortArr) { + Pair pair; + try { + pair = SystemInfoService.validateHostAndPort(hostPort); + } catch (AnalysisException e) { + throw new DdlException(e.getMessage()); + } + hostPortPairs.add(pair); + } + + FrontendNodeType nodeType; + if (role.equals(FOLLOWER)) { + nodeType = FrontendNodeType.FOLLOWER; + } else { + nodeType = FrontendNodeType.OBSERVER; + } + + for (Pair hostPortPair : 
hostPortPairs) { + Catalog.getInstance().addFrontend(nodeType, hostPortPair.first, hostPortPair.second); + } + + // to json response + RestBaseResult result = new RestBaseResult(); + + // send result + response.setContentType("application/json"); + response.getContent().append(result.toJson()); + sendResult(request, response); + } +} diff --git a/fe/src/com/baidu/palo/journal/JournalEntity.java b/fe/src/com/baidu/palo/journal/JournalEntity.java index 797588daf3..8d5a4e44c1 100644 --- a/fe/src/com/baidu/palo/journal/JournalEntity.java +++ b/fe/src/com/baidu/palo/journal/JournalEntity.java @@ -33,6 +33,7 @@ import com.baidu.palo.load.ExportJob; import com.baidu.palo.load.LoadErrorHub; import com.baidu.palo.load.LoadJob; import com.baidu.palo.master.Checkpoint; +import com.baidu.palo.persist.BackendIdsUpdateInfo; import com.baidu.palo.persist.CloneInfo; import com.baidu.palo.persist.ClusterInfo; import com.baidu.palo.persist.ConsistencyCheckInfo; @@ -41,14 +42,12 @@ import com.baidu.palo.persist.DatabaseInfo; import com.baidu.palo.persist.DropInfo; import com.baidu.palo.persist.DropLinkDbAndUpdateDbInfo; import com.baidu.palo.persist.DropPartitionInfo; -import com.baidu.palo.persist.LinkDbInfo; import com.baidu.palo.persist.ModifyPartitionInfo; import com.baidu.palo.persist.OperationType; import com.baidu.palo.persist.PartitionPersistInfo; import com.baidu.palo.persist.RecoverInfo; import com.baidu.palo.persist.ReplicaPersistInfo; import com.baidu.palo.persist.TableInfo; -import com.baidu.palo.persist.BackendIdsUpdateInfo; import com.baidu.palo.qe.SessionVariable; import com.baidu.palo.system.Backend; import com.baidu.palo.system.Frontend; diff --git a/fe/src/com/baidu/palo/journal/bdbje/BDBEnvironment.java b/fe/src/com/baidu/palo/journal/bdbje/BDBEnvironment.java index 16d40f12db..4b5e019bf5 100644 --- a/fe/src/com/baidu/palo/journal/bdbje/BDBEnvironment.java +++ b/fe/src/com/baidu/palo/journal/bdbje/BDBEnvironment.java @@ -13,8 +13,8 @@ // specific language 
governing permissions and limitations // under the License. -package com.baidu.palo.journal.bdbje; - +package com.baidu.palo.journal.bdbje; + import com.baidu.palo.catalog.Catalog; import com.baidu.palo.common.Config; import com.baidu.palo.ha.BDBHA; @@ -28,7 +28,6 @@ import com.sleepycat.je.DatabaseNotFoundException; import com.sleepycat.je.Durability; import com.sleepycat.je.Durability.ReplicaAckPolicy; import com.sleepycat.je.Durability.SyncPolicy; -import com.sleepycat.je.Environment; import com.sleepycat.je.EnvironmentConfig; import com.sleepycat.je.EnvironmentFailureException; import com.sleepycat.je.rep.InsufficientLogException; @@ -53,175 +52,175 @@ import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.concurrent.TimeUnit; -import java.util.concurrent.locks.ReentrantReadWriteLock; - -/* this class contains the reference to bdb environment. - * including all the opened databases and the replicationGroupAdmin. - * we can get the information of this bdb group through the API of replicationGroupAdmin - */ -public class BDBEnvironment { - private static final Logger LOG = LogManager.getLogger(BDBEnvironment.class); - private static final int RETRY_TIME = 3; - private static final int MEMORY_CACHE_PERCENT = 20; - - public static final String PALO_JOURNAL_GROUP = "PALO_JOURNAL_GROUP"; - - private ReplicatedEnvironment replicatedEnvironment; - private EnvironmentConfig environmentConfig; - private ReplicationConfig replicationConfig; - private DatabaseConfig dbConfig; - private Database epochDB = null; // used for fencing - private ReplicationGroupAdmin replicationGroupAdmin = null; - private ReentrantReadWriteLock lock; - private List openedDatabases; - - public BDBEnvironment() { - openedDatabases = new ArrayList(); - this.lock = new ReentrantReadWriteLock(true); - } - - // The setup() method opens the environment and database - public void setup(File envHome, String selfNodeName, String selfNodeHostPort, - String helperHostPort, 
boolean isElectable) { - - // Almost never used, just in case the master can not restart - if (Config.metadata_failure_recovery.equals("true")) { - if (!isElectable) { - LOG.error("Current node is not in the electable_nodes list. will exit"); - System.exit(-1); - } - DbResetRepGroup resetUtility = new DbResetRepGroup(envHome, PALO_JOURNAL_GROUP, selfNodeName, - selfNodeHostPort); - resetUtility.reset(); - LOG.info("group has been reset."); - } - - // set replication config - replicationConfig = new ReplicationConfig(); - replicationConfig.setNodeName(selfNodeName); - replicationConfig.setNodeHostPort(selfNodeHostPort); - replicationConfig.setHelperHosts(helperHostPort); - replicationConfig.setGroupName(PALO_JOURNAL_GROUP); +import java.util.concurrent.locks.ReentrantReadWriteLock; + +/* this class contains the reference to bdb environment. + * including all the opened databases and the replicationGroupAdmin. + * we can get the information of this bdb group through the API of replicationGroupAdmin + */ +public class BDBEnvironment { + private static final Logger LOG = LogManager.getLogger(BDBEnvironment.class); + private static final int RETRY_TIME = 3; + private static final int MEMORY_CACHE_PERCENT = 20; + + public static final String PALO_JOURNAL_GROUP = "PALO_JOURNAL_GROUP"; + + private ReplicatedEnvironment replicatedEnvironment; + private EnvironmentConfig environmentConfig; + private ReplicationConfig replicationConfig; + private DatabaseConfig dbConfig; + private Database epochDB = null; // used for fencing + private ReplicationGroupAdmin replicationGroupAdmin = null; + private ReentrantReadWriteLock lock; + private List openedDatabases; + + public BDBEnvironment() { + openedDatabases = new ArrayList(); + this.lock = new ReentrantReadWriteLock(true); + } + + // The setup() method opens the environment and database + public void setup(File envHome, String selfNodeName, String selfNodeHostPort, + String helperHostPort, boolean isElectable) { + + // Almost 
never used, just in case the master can not restart + if (Config.metadata_failure_recovery.equals("true")) { + if (!isElectable) { + LOG.error("Current node is not in the electable_nodes list. will exit"); + System.exit(-1); + } + DbResetRepGroup resetUtility = new DbResetRepGroup(envHome, PALO_JOURNAL_GROUP, selfNodeName, + selfNodeHostPort); + resetUtility.reset(); + LOG.info("group has been reset."); + } + + // set replication config + replicationConfig = new ReplicationConfig(); + replicationConfig.setNodeName(selfNodeName); + replicationConfig.setNodeHostPort(selfNodeHostPort); + replicationConfig.setHelperHosts(helperHostPort); + replicationConfig.setGroupName(PALO_JOURNAL_GROUP); replicationConfig.setConfigParam(ReplicationConfig.ENV_UNKNOWN_STATE_TIMEOUT, "10"); replicationConfig.setMaxClockDelta(Config.max_bdbje_clock_delta_ms, TimeUnit.MILLISECONDS); - - if (isElectable) { - replicationConfig.setReplicaAckTimeout(2, TimeUnit.SECONDS); + + if (isElectable) { + replicationConfig.setReplicaAckTimeout(2, TimeUnit.SECONDS); replicationConfig.setConfigParam(ReplicationConfig.REPLICA_MAX_GROUP_COMMIT, "0"); - replicationConfig.setConsistencyPolicy(new NoConsistencyRequiredPolicy()); - } else { - replicationConfig.setNodeType(NodeType.SECONDARY); - replicationConfig.setConsistencyPolicy(new NoConsistencyRequiredPolicy()); - } - - // set environment config - environmentConfig = new EnvironmentConfig(); - environmentConfig.setTransactional(true); - environmentConfig.setAllowCreate(true); - environmentConfig.setCachePercent(MEMORY_CACHE_PERCENT); - if (isElectable) { - Durability durability = new Durability(getSyncPolicy(Config.master_sync_policy), - getSyncPolicy(Config.replica_sync_policy), getAckPolicy(Config.replica_ack_policy)); - environmentConfig.setDurability(durability); - } - - // set database config - dbConfig = new DatabaseConfig(); - dbConfig.setTransactional(true); - if (isElectable) { - dbConfig.setAllowCreate(true); - dbConfig.setReadOnly(false); - } 
else { - dbConfig.setAllowCreate(false); - dbConfig.setReadOnly(true); - } - - // open environment and epochDB - for (int i = 0; i < RETRY_TIME; i++) { - try { - // open the environment - replicatedEnvironment = new ReplicatedEnvironment(envHome, replicationConfig, environmentConfig); - - // get replicationGroupAdmin object. - Set adminNodes = new HashSet(); - // 1. add helper node - InetSocketAddress helper = new InetSocketAddress(helperHostPort.split(":")[0], - Integer.parseInt(helperHostPort.split(":")[1])); - adminNodes.add(helper); - LOG.info("add helper[{}] as ReplicationGroupAdmin", helperHostPort); - // 2. add self if is electable - if (!selfNodeHostPort.equals(helperHostPort) && Catalog.getInstance().isElectable()) { - InetSocketAddress self = new InetSocketAddress(selfNodeHostPort.split(":")[0], - Integer.parseInt(selfNodeHostPort.split(":")[1])); - adminNodes.add(self); - LOG.info("add self[{}] as ReplicationGroupAdmin", selfNodeHostPort); - } - + replicationConfig.setConsistencyPolicy(new NoConsistencyRequiredPolicy()); + } else { + replicationConfig.setNodeType(NodeType.SECONDARY); + replicationConfig.setConsistencyPolicy(new NoConsistencyRequiredPolicy()); + } + + // set environment config + environmentConfig = new EnvironmentConfig(); + environmentConfig.setTransactional(true); + environmentConfig.setAllowCreate(true); + environmentConfig.setCachePercent(MEMORY_CACHE_PERCENT); + if (isElectable) { + Durability durability = new Durability(getSyncPolicy(Config.master_sync_policy), + getSyncPolicy(Config.replica_sync_policy), getAckPolicy(Config.replica_ack_policy)); + environmentConfig.setDurability(durability); + } + + // set database config + dbConfig = new DatabaseConfig(); + dbConfig.setTransactional(true); + if (isElectable) { + dbConfig.setAllowCreate(true); + dbConfig.setReadOnly(false); + } else { + dbConfig.setAllowCreate(false); + dbConfig.setReadOnly(true); + } + + // open environment and epochDB + for (int i = 0; i < RETRY_TIME; i++) { + 
try { + // open the environment + replicatedEnvironment = new ReplicatedEnvironment(envHome, replicationConfig, environmentConfig); + + // get replicationGroupAdmin object. + Set adminNodes = new HashSet(); + // 1. add helper node + InetSocketAddress helper = new InetSocketAddress(helperHostPort.split(":")[0], + Integer.parseInt(helperHostPort.split(":")[1])); + adminNodes.add(helper); + LOG.info("add helper[{}] as ReplicationGroupAdmin", helperHostPort); + // 2. add self if is electable + if (!selfNodeHostPort.equals(helperHostPort) && Catalog.getInstance().isElectable()) { + InetSocketAddress self = new InetSocketAddress(selfNodeHostPort.split(":")[0], + Integer.parseInt(selfNodeHostPort.split(":")[1])); + adminNodes.add(self); + LOG.info("add self[{}] as ReplicationGroupAdmin", selfNodeHostPort); + } + replicationGroupAdmin = new ReplicationGroupAdmin(PALO_JOURNAL_GROUP, adminNodes); - - // get a BDBHA object and pass the reference to Catalog - HAProtocol protocol = new BDBHA(this, selfNodeName); - Catalog.getInstance().setHaProtocol(protocol); - - // start state change listener - StateChangeListener listener = new BDBStateChangeListener(); - replicatedEnvironment.setStateChangeListener(listener); - - // open epochDB. the first parameter null means auto-commit - epochDB = replicatedEnvironment.openDatabase(null, "epochDB", dbConfig); - break; - } catch (InsufficientLogException insufficientLogEx) { - NetworkRestore restore = new NetworkRestore(); - NetworkRestoreConfig config = new NetworkRestoreConfig(); - config.setRetainLogFiles(false); // delete obsolete log files. - // Use the members returned by insufficientLogEx.getLogProviders() - // to select the desired subset of members and pass the resulting - // list as the argument to config.setLogProviders(), if the - // default selection of providers is not suitable. 
- restore.execute(insufficientLogEx, config); - continue; - } catch (DatabaseException e) { - if (i < RETRY_TIME - 1) { - try { - Thread.sleep(5 * 1000); - } catch (InterruptedException e1) { - e1.printStackTrace(); - } - continue; - } else { - LOG.error("error to open replicated environment. will exit.", e); - System.exit(-1); - } - } - } - } - - public ReplicationGroupAdmin getReplicationGroupAdmin() { - return this.replicationGroupAdmin; + + // get a BDBHA object and pass the reference to Catalog + HAProtocol protocol = new BDBHA(this, selfNodeName); + Catalog.getInstance().setHaProtocol(protocol); + + // start state change listener + StateChangeListener listener = new BDBStateChangeListener(); + replicatedEnvironment.setStateChangeListener(listener); + + // open epochDB. the first parameter null means auto-commit + epochDB = replicatedEnvironment.openDatabase(null, "epochDB", dbConfig); + break; + } catch (InsufficientLogException insufficientLogEx) { + NetworkRestore restore = new NetworkRestore(); + NetworkRestoreConfig config = new NetworkRestoreConfig(); + config.setRetainLogFiles(false); // delete obsolete log files. + // Use the members returned by insufficientLogEx.getLogProviders() + // to select the desired subset of members and pass the resulting + // list as the argument to config.setLogProviders(), if the + // default selection of providers is not suitable. + restore.execute(insufficientLogEx, config); + continue; + } catch (DatabaseException e) { + if (i < RETRY_TIME - 1) { + try { + Thread.sleep(5 * 1000); + } catch (InterruptedException e1) { + e1.printStackTrace(); + } + continue; + } else { + LOG.error("error to open replicated environment. 
will exit.", e); + System.exit(-1); + } + } + } + } + + public ReplicationGroupAdmin getReplicationGroupAdmin() { + return this.replicationGroupAdmin; } public void setNewReplicationGroupAdmin(Set newHelperNodes) { this.replicationGroupAdmin = new ReplicationGroupAdmin(PALO_JOURNAL_GROUP, newHelperNodes); } - - // Return a handle to the epochDB - public Database getEpochDB() { - return epochDB; - } - - // Return a handle to the environment - public Environment getReplicatedEnvironment() { - return replicatedEnvironment; - } - + + // Return a handle to the epochDB + public Database getEpochDB() { + return epochDB; + } + + // Return a handle to the environment + public ReplicatedEnvironment getReplicatedEnvironment() { + return replicatedEnvironment; + } + // return the database reference with the given name - // also try to close previous opened database. - public Database openDatabase(String dbName) { - Database db = null; - lock.writeLock().lock(); + // also try to close previous opened database. + public Database openDatabase(String dbName) { + Database db = null; + lock.writeLock().lock(); try { - // find if the specified database is already opened. find and return it. - for (java.util.Iterator iter = openedDatabases.iterator(); iter.hasNext();) { + // find if the specified database is already opened. find and return it. + for (java.util.Iterator iter = openedDatabases.iterator(); iter.hasNext();) { Database openedDb = iter.next(); try { if (openedDb.getDatabaseName() == null) { @@ -252,156 +251,157 @@ public class BDBEnvironment { iter.remove(); continue; } - - if (openedDb.getDatabaseName().equals(dbName)) { - return openedDb; - } - } - // open the specified database. 
- // the first parameter null means auto-commit - try { - db = replicatedEnvironment.openDatabase(null, dbName, dbConfig); - openedDatabases.add(db); - } catch (Exception e) { - LOG.warn("catch an exception when open database {}", dbName, e); - } - } finally { - lock.writeLock().unlock(); - } - return db; - } - - // close and remove the database whose name is dbName - public void removeDatabase(String dbName) { - lock.writeLock().lock(); - try { - String targetDbName = null; - int index = 0; - for (Database db : openedDatabases) { - String name = db.getDatabaseName(); - if (dbName.equals(name)) { + if (openedDb.getDatabaseName().equals(dbName)) { + return openedDb; + } + } + + // open the specified database. + // the first parameter null means auto-commit + try { + db = replicatedEnvironment.openDatabase(null, dbName, dbConfig); + openedDatabases.add(db); + } catch (Exception e) { + LOG.warn("catch an exception when open database {}", dbName, e); + } + } finally { + lock.writeLock().unlock(); + } + return db; + } + + // close and remove the database whose name is dbName + public void removeDatabase(String dbName) { + lock.writeLock().lock(); + try { + String targetDbName = null; + int index = 0; + for (Database db : openedDatabases) { + String name = db.getDatabaseName(); + if (dbName.equals(name)) { db.close(); - LOG.info("database {} has been closed", name); - targetDbName = name; - break; - } - index++; - } - if (targetDbName != null) { - LOG.info("begin to remove database {} from openedDatabases", targetDbName); - openedDatabases.remove(index); - } - try { - LOG.info("begin to remove database {} from replicatedEnviroment", dbName); - // the first parameter null means auto-commit - replicatedEnvironment.removeDatabase(null, dbName); - } catch (DatabaseNotFoundException e) { - LOG.warn("catch an exception when remove db:{}, this db does not exist", dbName, e); - } - } finally { - lock.writeLock().unlock(); - } - } - - // get journal db names and sort the names - 
public List getDatabaseNames() { - List ret = new ArrayList(); - List names = null; - int tried = 0; - while (true) { - try { - names = replicatedEnvironment.getDatabaseNames(); - break; - } catch (InsufficientLogException e) { - throw e; - } catch (EnvironmentFailureException e) { - tried++; - if (tried == RETRY_TIME) { - LOG.error("bdb environment failure exception.", e); - System.exit(-1); - } - LOG.warn("bdb environment failure exception. will retry", e); - try { - Thread.sleep(1000); - } catch (InterruptedException e1) { - e1.printStackTrace(); - } - continue; - } catch (DatabaseException e) { - LOG.warn("catch an exception when calling getDatabaseNames", e); - return null; - } - } - - if (names != null) { - for (String name : names) { - // We don't count epochDB - if (name.equals("epochDB")) { - continue; - } - - long db = Long.parseLong(name); - ret.add(db); - } - } - - Collections.sort(ret); - return ret; - } - - // Close the store and environment - public void close() { - for (Database db : openedDatabases) { - try { - db.close(); - } catch (DatabaseException exception) { - LOG.error("Error closing db {} will exit", db.getDatabaseName(), exception); - System.exit(-1); - } - } - openedDatabases.clear(); - - if (epochDB != null) { - try { - epochDB.close(); - } catch (DatabaseException exception) { - LOG.error("Error closing db {} will exit", epochDB.getDatabaseName(), exception); - System.exit(-1); - } - } - - if (replicatedEnvironment != null) { - try { - // Finally, close the store and environment. 
- replicatedEnvironment.close(); - } catch (DatabaseException exception) { - LOG.error("Error closing replicatedEnviroment", exception); - System.exit(-1); - } - } - } - - private SyncPolicy getSyncPolicy(String policy) { - if (policy.equalsIgnoreCase("SYNC")) { - return Durability.SyncPolicy.SYNC; - } - if (policy.equalsIgnoreCase("NO_SYNC")) { - return Durability.SyncPolicy.NO_SYNC; - } - // default value is WRITE_NO_SYNC - return Durability.SyncPolicy.WRITE_NO_SYNC; - } - - private ReplicaAckPolicy getAckPolicy(String policy) { - if (policy.equalsIgnoreCase("ALL")) { - return Durability.ReplicaAckPolicy.ALL; - } - if (policy.equalsIgnoreCase("NONE")) { - return Durability.ReplicaAckPolicy.NONE; - } - // default value is SIMPLE_MAJORITY - return Durability.ReplicaAckPolicy.SIMPLE_MAJORITY; - } - -} + LOG.info("database {} has been closed", name); + targetDbName = name; + break; + } + index++; + } + if (targetDbName != null) { + LOG.info("begin to remove database {} from openedDatabases", targetDbName); + openedDatabases.remove(index); + } + try { + LOG.info("begin to remove database {} from replicatedEnviroment", dbName); + // the first parameter null means auto-commit + replicatedEnvironment.removeDatabase(null, dbName); + } catch (DatabaseNotFoundException e) { + LOG.warn("catch an exception when remove db:{}, this db does not exist", dbName, e); + } + } finally { + lock.writeLock().unlock(); + } + } + + // get journal db names and sort the names + public List getDatabaseNames() { + List ret = new ArrayList(); + List names = null; + int tried = 0; + while (true) { + try { + names = replicatedEnvironment.getDatabaseNames(); + break; + } catch (InsufficientLogException e) { + throw e; + } catch (EnvironmentFailureException e) { + tried++; + if (tried == RETRY_TIME) { + LOG.error("bdb environment failure exception.", e); + System.exit(-1); + } + LOG.warn("bdb environment failure exception. 
will retry", e); + try { + Thread.sleep(1000); + } catch (InterruptedException e1) { + e1.printStackTrace(); + } + continue; + } catch (DatabaseException e) { + LOG.warn("catch an exception when calling getDatabaseNames", e); + return null; + } + } + + if (names != null) { + for (String name : names) { + // We don't count epochDB + if (name.equals("epochDB")) { + continue; + } + + long db = Long.parseLong(name); + ret.add(db); + } + } + + Collections.sort(ret); + return ret; + } + + // Close the store and environment + public void close() { + for (Database db : openedDatabases) { + try { + db.close(); + } catch (DatabaseException exception) { + LOG.error("Error closing db {} will exit", db.getDatabaseName(), exception); + System.exit(-1); + } + } + openedDatabases.clear(); + + if (epochDB != null) { + try { + epochDB.close(); + } catch (DatabaseException exception) { + LOG.error("Error closing db {} will exit", epochDB.getDatabaseName(), exception); + System.exit(-1); + } + } + + if (replicatedEnvironment != null) { + try { + // Finally, close the store and environment. 
+ replicatedEnvironment.close(); + } catch (DatabaseException exception) { + LOG.error("Error closing replicatedEnviroment", exception); + System.exit(-1); + } + } + } + + private SyncPolicy getSyncPolicy(String policy) { + if (policy.equalsIgnoreCase("SYNC")) { + return Durability.SyncPolicy.SYNC; + } + if (policy.equalsIgnoreCase("NO_SYNC")) { + return Durability.SyncPolicy.NO_SYNC; + } + // default value is WRITE_NO_SYNC + return Durability.SyncPolicy.WRITE_NO_SYNC; + } + + private ReplicaAckPolicy getAckPolicy(String policy) { + if (policy.equalsIgnoreCase("ALL")) { + return Durability.ReplicaAckPolicy.ALL; + } + if (policy.equalsIgnoreCase("NONE")) { + return Durability.ReplicaAckPolicy.NONE; + } + // default value is SIMPLE_MAJORITY + return Durability.ReplicaAckPolicy.SIMPLE_MAJORITY; + } + +} + diff --git a/fe/src/com/baidu/palo/journal/bdbje/BDBJournalCursor.java b/fe/src/com/baidu/palo/journal/bdbje/BDBJournalCursor.java index 8a944d0fd5..08b4ad57b0 100644 --- a/fe/src/com/baidu/palo/journal/bdbje/BDBJournalCursor.java +++ b/fe/src/com/baidu/palo/journal/bdbje/BDBJournalCursor.java @@ -13,8 +13,8 @@ // specific language governing permissions and limitations // under the License. 
-package com.baidu.palo.journal.bdbje; - +package com.baidu.palo.journal.bdbje; + import com.baidu.palo.journal.JournalCursor; import com.baidu.palo.journal.JournalEntity; @@ -29,118 +29,131 @@ import org.apache.logging.log4j.Logger; import java.io.ByteArrayInputStream; import java.io.DataInputStream; -import java.util.List; - -public class BDBJournalCursor implements JournalCursor { - private static final Logger LOG = LogManager.getLogger(JournalCursor.class); - - private long toKey; - private long currentKey; - private BDBEnvironment environment; - private List dbNames; - private Database database; - private int nextDbPositionIndex; - private final int maxTryTime = 3; - - public static BDBJournalCursor getJournalCursor(BDBEnvironment env, long fromKey, long toKey) { - if (toKey < fromKey || fromKey < 0) { - System.out.println("Invalid key range!"); - return null; - } - BDBJournalCursor cursor = null; - try { - cursor = new BDBJournalCursor(env, fromKey, toKey); - } catch (Exception e) { - LOG.error("new BDBJournalCursor error.", e); - } - return cursor; - } - - - private BDBJournalCursor(BDBEnvironment env, long fromKey, long toKey) throws Exception { - this.environment = env; - this.toKey = toKey; - this.currentKey = fromKey; - this.dbNames = env.getDatabaseNames(); - if (dbNames == null) { - throw new NullPointerException("dbNames is null."); - } - this.nextDbPositionIndex = 0; - - // find the db which may contain the fromKey - String dbName = null; - for (long db : dbNames) { - if (fromKey >= db) { - dbName = Long.toString(db); - nextDbPositionIndex++; - continue; - } else { - break; - } - } - - if (dbName == null) { - LOG.error("Can not find the key:{}, fail to get journal cursor. 
will exit.", fromKey); - System.exit(-1); - } - this.database = env.openDatabase(dbName); - } - - @Override - public JournalEntity next() { - JournalEntity ret = null; - if (currentKey > toKey) { - return ret; - } - Long key = new Long(currentKey); - DatabaseEntry theKey = new DatabaseEntry(); - TupleBinding myBinding = TupleBinding.getPrimitiveBinding(Long.class); - myBinding.objectToEntry(key, theKey); - - DatabaseEntry theData = new DatabaseEntry(); - // if current db does not contain any more data, then we go to search the next db - try { - // null means perform the operation without transaction protection. - // READ_COMMITTED guarantees no dirty read. - int tryTimes = 0; +import java.util.List; + +public class BDBJournalCursor implements JournalCursor { + private static final Logger LOG = LogManager.getLogger(JournalCursor.class); + + private long toKey; + private long currentKey; + private BDBEnvironment environment; + private List dbNames; + private Database database; + private int nextDbPositionIndex; + private final int maxTryTime = 3; + + public static BDBJournalCursor getJournalCursor(BDBEnvironment env, long fromKey, long toKey) { + if (toKey < fromKey || fromKey < 0) { + System.out.println("Invalid key range!"); + return null; + } + BDBJournalCursor cursor = null; + try { + cursor = new BDBJournalCursor(env, fromKey, toKey); + } catch (Exception e) { + LOG.error("new BDBJournalCursor error.", e); + } + return cursor; + } + + + private BDBJournalCursor(BDBEnvironment env, long fromKey, long toKey) throws Exception { + this.environment = env; + this.toKey = toKey; + this.currentKey = fromKey; + this.dbNames = env.getDatabaseNames(); + if (dbNames == null) { + throw new NullPointerException("dbNames is null."); + } + this.nextDbPositionIndex = 0; + + // find the db which may contain the fromKey + String dbName = null; + for (long db : dbNames) { + if (fromKey >= db) { + dbName = Long.toString(db); + nextDbPositionIndex++; + continue; + } else { + break; + 
} + } + + if (dbName == null) { + LOG.error("Can not find the key:{}, fail to get journal cursor. will exit.", fromKey); + System.exit(-1); + } + this.database = env.openDatabase(dbName); + } + + @Override + public JournalEntity next() { + JournalEntity ret = null; + if (currentKey > toKey) { + return ret; + } + Long key = new Long(currentKey); + DatabaseEntry theKey = new DatabaseEntry(); + TupleBinding myBinding = TupleBinding.getPrimitiveBinding(Long.class); + myBinding.objectToEntry(key, theKey); + + DatabaseEntry theData = new DatabaseEntry(); + // if current db does not contain any more data, then we go to search the next db + try { + // null means perform the operation without transaction protection. + // READ_COMMITTED guarantees no dirty read. + int tryTimes = 0; while (true) { - OperationStatus operationStatus = database.get(null, theKey, theData, LockMode.READ_COMMITTED); - if (operationStatus == OperationStatus.SUCCESS) { - // Recreate the data String. - byte[] retData = theData.getData(); - DataInputStream in = new DataInputStream(new ByteArrayInputStream(retData)); - ret = new JournalEntity(); - try { - ret.readFields(in); - } catch (Exception e) { - LOG.error("fail to read journal entity key={}, will exit", currentKey, e); - System.exit(-1); - } - currentKey++; - return ret; - } else if (nextDbPositionIndex < dbNames.size() && currentKey == dbNames.get(nextDbPositionIndex)) { - database = environment.openDatabase(dbNames.get(nextDbPositionIndex).toString()); - nextDbPositionIndex++; - tryTimes = 0; - continue; - } else if (tryTimes < maxTryTime) { - tryTimes++; - LOG.warn("fail to get journal {}, will try again. status: {}", currentKey, operationStatus); - Thread.sleep(3000); - continue; - } else { - LOG.error("fail to get journal {}, will exit", currentKey); - System.exit(-1); - } - } - } catch (Exception e) { - LOG.warn("Catch an exception when get next JournalEntity. 
key:{}", currentKey, e); - return null; - } - } - - @Override - public void close() { - - } -} + OperationStatus operationStatus = database.get(null, theKey, theData, LockMode.READ_COMMITTED); + if (operationStatus == OperationStatus.SUCCESS) { + // Recreate the data String. + byte[] retData = theData.getData(); + DataInputStream in = new DataInputStream(new ByteArrayInputStream(retData)); + ret = new JournalEntity(); + try { + ret.readFields(in); + } catch (Exception e) { + LOG.error("fail to read journal entity key={}, will exit", currentKey, e); + System.exit(-1); + } + currentKey++; + return ret; + } else if (nextDbPositionIndex < dbNames.size() && currentKey == dbNames.get(nextDbPositionIndex)) { + database = environment.openDatabase(dbNames.get(nextDbPositionIndex).toString()); + nextDbPositionIndex++; + tryTimes = 0; + continue; + } else if (tryTimes < maxTryTime) { + tryTimes++; + LOG.warn("fail to get journal {}, will try again. status: {}", currentKey, operationStatus); + Thread.sleep(3000); + continue; + } else if (operationStatus == OperationStatus.NOTFOUND) { + // In the case: + // On non-master FE, the replayer will first get the max journal id, + // than try to replay logs from current replayed id to the max journal id. But when + // master FE try to write a log to bdbje, but crashed before this log is committed, + // the non-master FE may still get this incomplete log's id as max journal id, + // and try to replay it. We will first get LockTimeoutException (because the transaction + // is hanging and waiting to be aborted after timeout). and after this log abort, + // we will get NOTFOUND. + // So we simply throw a exception and let the replayer get the max id again. 
+ throw new Exception( + "Failed to find key " + currentKey + " in database " + database.getDatabaseName()); + } else { + LOG.error("fail to get journal {}, status: {}, will exit", currentKey); + System.exit(-1); + } + } + } catch (Exception e) { + LOG.warn("Catch an exception when get next JournalEntity. key:{}", currentKey, e); + return null; + } + } + + @Override + public void close() { + + } +} + diff --git a/fe/src/com/baidu/palo/journal/bdbje/BDBTool.java b/fe/src/com/baidu/palo/journal/bdbje/BDBTool.java new file mode 100644 index 0000000000..2f2d964886 --- /dev/null +++ b/fe/src/com/baidu/palo/journal/bdbje/BDBTool.java @@ -0,0 +1,150 @@ +package com.baidu.palo.journal.bdbje; + +import com.baidu.palo.catalog.Catalog; +import com.baidu.palo.journal.JournalEntity; + +import com.google.common.base.Preconditions; +import com.google.common.base.Strings; +import com.google.common.collect.Maps; +import com.sleepycat.bind.tuple.TupleBinding; +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseConfig; +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.Environment; +import com.sleepycat.je.EnvironmentConfig; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; + +import org.json.JSONArray; +import org.json.JSONObject; + +import java.io.ByteArrayInputStream; +import java.io.DataInputStream; +import java.io.File; +import java.io.UnsupportedEncodingException; +import java.util.List; +import java.util.Map; + +public class BDBTool { + + private String metaPath; + private BDBToolOptions options; + + public BDBTool(String metaPath, BDBToolOptions options) { + this.metaPath = metaPath; + this.options = options; + } + + public boolean run() { + EnvironmentConfig envConfig = new EnvironmentConfig(); + envConfig.setAllowCreate(false); + envConfig.setReadOnly(true); + envConfig.setCachePercent(20); + + Environment env = null; + try { + env = new Environment(new File(metaPath), 
envConfig); + } catch (DatabaseException e) { + e.printStackTrace(); + System.err.println("Failed to open BDBJE env: " + Catalog.BDB_DIR + ". exit"); + return false; + } + Preconditions.checkNotNull(env); + + try { + if (options.isListDbs()) { + // list all databases + List dbNames = env.getDatabaseNames(); + JSONArray jsonArray = new JSONArray(dbNames); + System.out.println(jsonArray.toString()); + return true; + } else { + // db operations + String dbName = options.getDbName(); + Preconditions.checkState(!Strings.isNullOrEmpty(dbName)); + DatabaseConfig dbConfig = new DatabaseConfig(); + dbConfig.setAllowCreate(false); + dbConfig.setReadOnly(true); + Database db = env.openDatabase(null, dbName, dbConfig); + + if (options.isDbStat()) { + // get db stat + Map statMap = Maps.newHashMap(); + statMap.put("count", String.valueOf(db.count())); + JSONObject jsonObject = new JSONObject(statMap); + System.out.println(jsonObject.toString()); + return true; + } else { + // set from key + Long fromKey = 0L; + String fromKeyStr = options.hasFromKey() ? options.getFromKey() : dbName; + try { + fromKey = Long.valueOf(fromKeyStr); + } catch (NumberFormatException e) { + System.err.println("Not a valid from key: " + fromKeyStr); + return false; + } + + // set end key + Long endKey = fromKey + db.count() - 1; + if (options.hasEndKey()) { + try { + endKey = Long.valueOf(options.getEndKey()); + } catch (NumberFormatException e) { + System.err.println("Not a valid end key: " + options.getEndKey()); + return false; + } + } + + if (fromKey > endKey) { + System.err.println("from key should less than or equal to end key[" + + fromKey + " vs. 
" + endKey + "]"); + return false; + } + + // meta version + Catalog.getInstance().setJournalVersion(options.getMetaVersion()); + + for (Long key = fromKey; key <= endKey; key++) { + getValueByKey(db, key); + } + } + } + } catch (Exception e) { + e.printStackTrace(); + System.err.println("Failed to run bdb tools"); + return false; + } + return true; + } + + private void getValueByKey(Database db, Long key) + throws UnsupportedEncodingException { + + DatabaseEntry queryKey = new DatabaseEntry(); + TupleBinding myBinding = TupleBinding.getPrimitiveBinding(Long.class); + myBinding.objectToEntry(key, queryKey); + DatabaseEntry value = new DatabaseEntry(); + + OperationStatus status = db.get(null, queryKey, value, LockMode.READ_COMMITTED); + if (status == OperationStatus.SUCCESS) { + byte[] retData = value.getData(); + DataInputStream in = new DataInputStream(new ByteArrayInputStream(retData)); + JournalEntity entity = new JournalEntity(); + try { + entity.readFields(in); + } catch (Exception e) { + e.printStackTrace(); + System.err.println("Fail to read journal entity for key: " + key + ". 
reason: " + e.getMessage()); + System.exit(-1); + } + System.out.println("key: " + key); + System.out.println("op code: " + entity.getOpCode()); + System.out.println("value: " + entity.getData().toString()); + } else if (status == OperationStatus.NOTFOUND) { + System.out.println("key: " + key); + System.out.println("value: NOT FOUND"); + } + } +} diff --git a/fe/src/com/baidu/palo/journal/bdbje/BDBToolOptions.java b/fe/src/com/baidu/palo/journal/bdbje/BDBToolOptions.java new file mode 100644 index 0000000000..ec16f5a552 --- /dev/null +++ b/fe/src/com/baidu/palo/journal/bdbje/BDBToolOptions.java @@ -0,0 +1,72 @@ +package com.baidu.palo.journal.bdbje; + +import com.baidu.palo.common.FeConstants; + +import com.google.common.base.Strings; + +public class BDBToolOptions { + private boolean isListDbs; + private String dbName; + private boolean isDbStat; + private boolean hasFromKey; + private String fromKey; + private boolean hasEndKey; + private String endKey; + private int metaVersion; + + public BDBToolOptions(boolean isListDbs, String dbName, boolean isDbStat, + String fromKey, String endKey, int metaVersion) { + this.isListDbs = isListDbs; + this.dbName = dbName; + this.isDbStat = isDbStat; + this.fromKey = fromKey; + this.hasFromKey = !Strings.isNullOrEmpty(fromKey); + this.endKey = endKey; + this.hasEndKey = !Strings.isNullOrEmpty(endKey); + this.metaVersion = metaVersion == 0 ? 
FeConstants.meta_version : metaVersion; + } + + public boolean isListDbs() { + return isListDbs; + } + + public String getDbName() { + return dbName; + } + + public boolean isDbStat() { + return isDbStat; + } + + public boolean hasFromKey() { + return hasFromKey; + } + + public String getFromKey() { + return fromKey; + } + + public boolean hasEndKey() { + return hasEndKey; + } + + public String getEndKey() { + return endKey; + } + + public int getMetaVersion() { + return metaVersion; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("list bdb database: " + isListDbs).append("\n"); + sb.append("bdb database name: " + dbName).append("\n"); + sb.append("get bdb database stat: " + isDbStat).append("\n"); + sb.append("from key" + fromKey).append("\n"); + sb.append("end key: " + endKey).append("\n"); + sb.append("meta version: " + metaVersion).append("\n"); + return sb.toString(); + } +} diff --git a/fe/src/com/baidu/palo/load/Load.java b/fe/src/com/baidu/palo/load/Load.java index 478c761304..510c364197 100644 --- a/fe/src/com/baidu/palo/load/Load.java +++ b/fe/src/com/baidu/palo/load/Load.java @@ -2587,7 +2587,7 @@ public class Load { } private void checkHasRunningSyncDeleteJob(long partitionId, String partitionName) throws DdlException { - // check if there are syncronized delete job under going + // check if there are synchronized delete job under going readLock(); try { if (partitionUnderDelete.contains(partitionId)) { diff --git a/fe/src/com/baidu/palo/planner/AggregationNode.java b/fe/src/com/baidu/palo/planner/AggregationNode.java index 841f2e7d6e..52c7b850a0 100644 --- a/fe/src/com/baidu/palo/planner/AggregationNode.java +++ b/fe/src/com/baidu/palo/planner/AggregationNode.java @@ -55,6 +55,9 @@ public class AggregationNode extends PlanNode { // node is the root node of a distributed aggregation. private boolean needsFinalize; + // If true, use streaming preaggregation algorithm. Not valid if this is a merge agg. 
+ private boolean useStreamingPreagg; + /** * Create an agg node that is not an intermediate node. * isIntermediate is true if it is a slave node in a 2-part agg plan. @@ -88,6 +91,16 @@ public class AggregationNode extends PlanNode { updateplanNodeName(); } + /** + * Sets this node as a preaggregation. Only valid to call this if it is not marked + * as a preaggregation + */ + public void setIsPreagg(PlannerContext ctx_) { + useStreamingPreagg = ctx_.getQueryOptions().isSetDisable_stream_preaggregations() + && !ctx_.getQueryOptions().disable_stream_preaggregations + && aggInfo.getGroupingExprs().size() > 0; + } + @Override public void setCompactData(boolean on) { this.compactData = on; @@ -244,15 +257,28 @@ public class AggregationNode extends PlanNode { aggregateFunctions, aggInfo.getIntermediateTupleId().asInt(), aggInfo.getOutputTupleId().asInt(), needsFinalize); + msg.agg_node.setUse_streaming_preaggregation(useStreamingPreagg); List groupingExprs = aggInfo.getGroupingExprs(); if (groupingExprs != null) { msg.agg_node.setGrouping_exprs(Expr.treesToThrift(groupingExprs)); } } + protected String getDisplayLabelDetail() { + if (useStreamingPreagg) { + return "STREAMING"; + } + return null; + } + @Override protected String getNodeExplainString(String detailPrefix, TExplainLevel detailLevel) { StringBuilder output = new StringBuilder(); + String nameDetail = getDisplayLabelDetail(); + if (nameDetail != null) { + output.append(detailPrefix + nameDetail + "\n"); + } + if (aggInfo.getAggregateExprs() != null && aggInfo.getMaterializedAggregateExprs().size() > 0) { output.append(detailPrefix + "output: ").append( getExplainString(aggInfo.getAggregateExprs()) + "\n"); diff --git a/fe/src/com/baidu/palo/planner/DistributedPlanner.java b/fe/src/com/baidu/palo/planner/DistributedPlanner.java index e1701785f8..544c9d1c15 100644 --- a/fe/src/com/baidu/palo/planner/DistributedPlanner.java +++ b/fe/src/com/baidu/palo/planner/DistributedPlanner.java @@ -527,21 +527,21 @@ public 
class DistributedPlanner { for (int i = 0; i < childFragments.size(); ++i) { PlanFragment childFragment = childFragments.get(i); /* if (childFragment.isPartitioned() && childFragment.getPlanRoot().getNumInstances() > 1) { - * // absorb the plan trees of all partitioned child fragments into unionNode - * unionNode.addChild(childFragment.getPlanRoot()); - * unionFragment.setFragmentInPlanTree(unionNode.getChild(i)); - * unionFragment.addChildren(childFragment.getChildren()); - * fragments.remove(childFragment); + * // absorb the plan trees of all partitioned child fragments into unionNode + * unionNode.addChild(childFragment.getPlanRoot()); + * unionFragment.setFragmentInPlanTree(unionNode.getChild(i)); + * unionFragment.addChildren(childFragment.getChildren()); + * fragments.remove(childFragment); * } else { - * // dummy entry for subsequent addition of the ExchangeNode - * unionNode.addChild(null); - * // Connect the unpartitioned child fragments to unionNode via a random exchange. - * connectChildFragment(unionNode, i, unionFragment, childFragment); - * childFragment.setOutputPartition(DataPartition.RANDOM); + * // dummy entry for subsequent addition of the ExchangeNode + * unionNode.addChild(null); + * // Connect the unpartitioned child fragments to unionNode via a random exchange. + * connectChildFragment(unionNode, i, unionFragment, childFragment); + * childFragment.setOutputPartition(DataPartition.RANDOM); * } */ - // UnionNode should't be absorbed by childFragment, because it reduce + // UnionNode should't be absorbed by childFragment, because it reduce // the degree of concurrency. 
// chenhao16 add // dummy entry for subsequent addition of the ExchangeNode @@ -695,8 +695,8 @@ public class DistributedPlanner { // and goes into a parent fragment childFragment.addPlanRoot(node); node.setIntermediateTuple(); - // TODO(zc) - // node.setIsPreagg(ctx_); + + node.setIsPreagg(ctx_); // if there is a limit, we need to transfer it from the pre-aggregation // node in the child fragment to the merge aggregation node in the parent @@ -788,8 +788,8 @@ public class DistributedPlanner { partitionExprs == null ? DataPartition.UNPARTITIONED : DataPartition.hashPartitioned(partitionExprs); // Convert the existing node to a preaggregation. AggregationNode preaggNode = (AggregationNode)node.getChild(0); - // TODO(zc) - // preaggNode.setIsPreagg(ctx_); + + preaggNode.setIsPreagg(ctx_); // place a merge aggregation step for the 1st phase in a new fragment mergeFragment = createParentFragment(childFragment, mergePartition); diff --git a/fe/src/com/baidu/palo/planner/PlanFragment.java b/fe/src/com/baidu/palo/planner/PlanFragment.java index ed1e8e822f..0ec301f902 100644 --- a/fe/src/com/baidu/palo/planner/PlanFragment.java +++ b/fe/src/com/baidu/palo/planner/PlanFragment.java @@ -181,6 +181,11 @@ public class PlanFragment extends TreeNode { result.setOutput_sink(sink.toThrift()); } result.setPartition(dataPartition.toThrift()); + + // TODO chenhao , calculated by cost + result.setMin_reservation_bytes(0); + result.setInitial_reservation_total_claims(0); + return result; } diff --git a/fe/src/com/baidu/palo/planner/Planner.java b/fe/src/com/baidu/palo/planner/Planner.java index c2119cb9a3..d5a0e0bf7b 100644 --- a/fe/src/com/baidu/palo/planner/Planner.java +++ b/fe/src/com/baidu/palo/planner/Planner.java @@ -141,8 +141,15 @@ public class Planner { singleNodePlanner = new SingleNodePlanner(plannerContext); PlanNode singleNodePlan = singleNodePlanner.createSingleNodePlan(); + singleNodePlanner.validatePlan(singleNodePlan); + List resultExprs = queryStmt.getResultExprs(); 
if (statment instanceof InsertStmt) { + if (queryOptions.isSetMt_dop() && queryOptions.mt_dop > 0) { + throw new NotImplementedException( + "MT_DOP not supported for plans with insert."); + } + InsertStmt insertStmt = (InsertStmt) statment; if (insertStmt.getOlapTuple() != null) { singleNodePlan = new OlapRewriteNode(plannerContext.getNextNodeId(), singleNodePlan, insertStmt); diff --git a/fe/src/com/baidu/palo/planner/SingleNodePlanner.java b/fe/src/com/baidu/palo/planner/SingleNodePlanner.java index 29c8c76274..a43a29ae96 100644 --- a/fe/src/com/baidu/palo/planner/SingleNodePlanner.java +++ b/fe/src/com/baidu/palo/planner/SingleNodePlanner.java @@ -34,6 +34,7 @@ import com.baidu.palo.analysis.FunctionCallExpr; import com.baidu.palo.analysis.InPredicate; import com.baidu.palo.analysis.InlineViewRef; import com.baidu.palo.analysis.IsNullPredicate; +import com.baidu.palo.analysis.JoinOperator; import com.baidu.palo.analysis.LiteralExpr; import com.baidu.palo.analysis.NullLiteral; import com.baidu.palo.analysis.QueryStmt; @@ -52,6 +53,7 @@ import com.baidu.palo.catalog.MysqlTable; import com.baidu.palo.catalog.Table; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.NotImplementedException; import com.baidu.palo.common.Pair; import com.baidu.palo.common.Reference; @@ -149,6 +151,31 @@ public class SingleNodePlanner { return singleNodePlan; } + /** + * Checks that the given single-node plan is executable: + * - It may not contain right or full outer joins with no equi-join conjuncts that + * are not inside the right child of a SubplanNode. + * - MT_DOP > 0 is not supported for plans with base table joins or table sinks. + * Throws a NotImplementedException if plan validation fails. 
+ */ + public void validatePlan(PlanNode planNode) throws NotImplementedException { + if (ctx_.getQueryOptions().isSetMt_dop() && ctx_.getQueryOptions().mt_dop > 0 + && (planNode instanceof HashJoinNode || planNode instanceof CrossJoinNode)) { + throw new NotImplementedException( + "MT_DOP not supported for plans with base table joins or table sinks."); + } + + // As long as MT_DOP is unset or 0 any join can run in a single-node plan. + if (ctx_.isSingleNodeExec() && + (!ctx_.getQueryOptions().isSetMt_dop() || ctx_.getQueryOptions().mt_dop == 0)) { + return; + } + + for (PlanNode child : planNode.getChildren()) { + validatePlan(child); + } + } + /** * Creates an EmptyNode that 'materializes' the tuples of the given stmt. * Marks all collection-typed slots referenced in stmt as non-materialized because @@ -330,11 +357,14 @@ public class SingleNodePlanner { boolean aggTableValidate = true; if (selectStmt.getTableRefs().size() > 1) { for (int i = 1; i < selectStmt.getTableRefs().size(); ++i) { - if (selectStmt.getTableRefs().get(i).getJoinOp().isOuterJoin()) { - LOG.info(logStr + selectStmt.getTableRefs().get(i) + " joinOp is outer-join"); + final JoinOperator joinOperator = selectStmt.getTableRefs().get(i).getJoinOp(); + // TODO chenhao16 , right out join ? 
+ if (joinOperator.isRightOuterJoin() || joinOperator.isFullOuterJoin()) { + LOG.info(logStr + selectStmt.getTableRefs().get(i) + + " joinOp is full outer join or right outer join."); aggTableValidate = false; break; - } + } } if (!aggTableValidate) { break; diff --git a/fe/src/com/baidu/palo/qe/Coordinator.java b/fe/src/com/baidu/palo/qe/Coordinator.java index 32b70e16eb..4005266f96 100644 --- a/fe/src/com/baidu/palo/qe/Coordinator.java +++ b/fe/src/com/baidu/palo/qe/Coordinator.java @@ -27,7 +27,10 @@ import com.baidu.palo.common.util.DebugUtil; import com.baidu.palo.common.util.RuntimeProfile; import com.baidu.palo.planner.DataPartition; import com.baidu.palo.planner.DataSink; +import com.baidu.palo.planner.DataStreamSink; import com.baidu.palo.planner.ExchangeNode; +import com.baidu.palo.planner.MysqlScanNode; +import com.baidu.palo.planner.OlapScanNode; import com.baidu.palo.planner.PlanFragment; import com.baidu.palo.planner.PlanFragmentId; import com.baidu.palo.planner.PlanNode; @@ -40,6 +43,7 @@ import com.baidu.palo.service.FrontendOptions; import com.baidu.palo.system.Backend; import com.baidu.palo.task.LoadEtlTask; import com.baidu.palo.thrift.BackendService; +import com.baidu.palo.thrift.PaloInternalServiceConstants; import com.baidu.palo.thrift.PaloInternalServiceVersion; import com.baidu.palo.thrift.TCancelPlanFragmentParams; import com.baidu.palo.thrift.TCancelPlanFragmentResult; @@ -48,8 +52,12 @@ import com.baidu.palo.thrift.TExecPlanFragmentParams; import com.baidu.palo.thrift.TExecPlanFragmentResult; import com.baidu.palo.thrift.TNetworkAddress; import com.baidu.palo.thrift.TPaloScanRange; +import com.baidu.palo.thrift.TPartitionType; +import com.baidu.palo.thrift.TPlan; import com.baidu.palo.thrift.TPlanFragmentDestination; import com.baidu.palo.thrift.TPlanFragmentExecParams; +import com.baidu.palo.thrift.TPlanNode; +import com.baidu.palo.thrift.TPlanNodeType; import com.baidu.palo.thrift.TQueryGlobals; import 
com.baidu.palo.thrift.TQueryOptions; import com.baidu.palo.thrift.TQueryType; @@ -68,6 +76,7 @@ import com.google.common.base.Strings; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; import com.google.common.collect.Maps; +import com.google.common.collect.Sets; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -77,11 +86,14 @@ import org.apache.thrift.transport.TTransportException; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; @@ -129,13 +141,9 @@ public class Coordinator { private List fragmentProfile; // populated in computeFragmentExecParams() - private Map fragmentExecParams = Maps.newHashMap(); + private Map fragmentExecParamsMap = Maps.newHashMap(); private List fragments; - // vector is indexed by fragment index from TQueryExecRequest.fragments; - // populated in computeScanRangeAssignment() - private Map scanRangeAssignment = - Maps.newHashMap(); // backend execute state private List backendExecStates = Lists.newArrayList(); private ResultReceiver receiver; @@ -167,6 +175,8 @@ public class Coordinator { private boolean needReport; private String clusterName; + // paralle execute + private final TUniqueId nextInstanceId; // Used for query public Coordinator(ConnectContext context, Analyzer analyzer, Planner planner) { @@ -182,6 +192,9 @@ public class Coordinator { context.getSessionVariable().getResourceGroup()); this.needReport = context.getSessionVariable().isReportSucc(); this.clusterName = context.getClusterName(); + this.nextInstanceId = new TUniqueId(); + nextInstanceId.setHi(queryId.hi); + 
nextInstanceId.setLo(queryId.lo + 1); } // Used for pull load task coordinator @@ -197,6 +210,9 @@ public class Coordinator { this.tResourceInfo = new TResourceInfo("", ""); this.needReport = true; this.clusterName = cluster; + this.nextInstanceId = new TUniqueId(); + nextInstanceId.setHi(queryId.hi); + nextInstanceId.setLo(queryId.lo + 1); } public TUniqueId getQueryId() { @@ -256,12 +272,21 @@ public class Coordinator { // Initiate private void prepare() { + for (PlanFragment fragment : fragments) { - // resize scan range assigment - scanRangeAssignment.put(fragment.getFragmentId(), new FragmentScanRangeAssignment()); - // resize fragment execute parameters - fragmentExecParams.put(fragment.getFragmentId(), new FragmentExecParams(fragment)); + fragmentExecParamsMap.put(fragment.getFragmentId(), new FragmentExecParams(fragment)); } + + // set inputFragments + for (PlanFragment fragment : fragments) { + if (!(fragment.getSink() instanceof DataStreamSink)) { + continue; + } + FragmentExecParams params = fragmentExecParamsMap.get(fragment.getDestFragment().getFragmentId()); + params.inputFragments.add(fragment.getFragmentId()); + + } + coordAddress = new TNetworkAddress(localIP, Config.rpc_port); int fragmentSize = fragments.size(); @@ -298,7 +323,7 @@ public class Coordinator { int idx = 0; sb.append("id=").append(DebugUtil.printId(queryId)).append(","); sb.append("fragment=["); - for (Map.Entry entry : fragmentExecParams.entrySet()) { + for (Map.Entry entry : fragmentExecParamsMap.entrySet()) { if (idx++ != 0) { sb.append(","); } @@ -354,17 +379,24 @@ public class Coordinator { prepare(); // compute Fragment Instance computeScanRangeAssignment(); - computeFragmentExecParams(); + + // if mt_dop <= 1 + if (queryOptions.mt_dop <= 1) { + computeFragmentExecParams(); + } else { + computeFragmentExecParamsForParallelExec(); + validate(); + } traceInstance(); // create result receiver PlanFragmentId topId = fragments.get(0).getFragmentId(); - FragmentExecParams topParams 
= fragmentExecParams.get(topId); + FragmentExecParams topParams = fragmentExecParamsMap.get(topId); if (topParams.fragment.getSink() instanceof ResultSink) { TPlanFragmentDestination rootSource = new TPlanFragmentDestination(); - rootSource.fragment_instance_id = topParams.instanceIds.get(0); - rootSource.server = topParams.hosts.get(0); + rootSource.fragment_instance_id = topParams.instanceExecParams.get(0).instanceId; + rootSource.server = topParams.instanceExecParams.get(0).host; receiver = new ResultReceiver(rootSource, addressToBackendID.get(rootSource.server), queryOptions.query_timeout * 1000); } else { @@ -384,11 +416,11 @@ public class Coordinator { int backendId = 0; int profileFragmentId = 0; for (PlanFragment fragment : fragments) { - FragmentExecParams params = fragmentExecParams.get(fragment.getFragmentId()); - + FragmentExecParams params = fragmentExecParamsMap.get(fragment.getFragmentId()); + // set up exec states - int numHosts = params.hosts.size(); - Preconditions.checkState(numHosts > 0); + int instanceNum = params.instanceExecParams.size(); + Preconditions.checkState(instanceNum > 0); List tParams = params.toThrift(backendId); int instanceId = 0; for (TExecPlanFragmentParams tParam : tParams) { @@ -402,7 +434,7 @@ public class Coordinator { } // Issue all rpcs in parallel ExecStatus status = new ExecStatus(); - ParallelExecutor.exec(backendExecStates, backendId - numHosts, numHosts, status); + ParallelExecutor.exec(backendExecStates, backendId - instanceNum, instanceNum, status); if (status.getErrCode() != TStatusCode.OK) { String errMsg = "exec rpc error"; queryStatus.setStatus(errMsg); @@ -666,22 +698,22 @@ public class Coordinator { } } - // fill all the fields in fragmentExecParams - private void computeFragmentExecParams() throws Exception { + + private void computeFragmentExecParams() throws Exception { // fill hosts field in fragmentExecParams computeFragmentHosts(); // assign instance ids numBackends = 0; - for (FragmentExecParams 
params : fragmentExecParams.values()) { - LOG.debug("parameter has hosts.{}", params.hosts.size()); - for (int j = 0; j < params.hosts.size(); ++j) { + for (FragmentExecParams params : fragmentExecParamsMap.values()) { + LOG.debug("parameter has instances.{}", params.instanceExecParams.size()); + for (int j = 0; j < params.instanceExecParams.size(); ++j) { // we add instance_num to query_id.lo to create a // globally-unique instance id TUniqueId instanceId = new TUniqueId(); instanceId.setHi(queryId.hi); instanceId.setLo(queryId.lo + numBackends + 1); - params.instanceIds.add(instanceId); + params.instanceExecParams.get(j).instanceId = instanceId; numBackends++; } @@ -689,13 +721,13 @@ public class Coordinator { // compute destinations and # senders per exchange node // (the root fragment doesn't have a destination) - for (FragmentExecParams params : fragmentExecParams.values()) { + for (FragmentExecParams params : fragmentExecParamsMap.values()) { PlanFragment destFragment = params.fragment.getDestFragment(); if (destFragment == null) { // root plan fragment continue; } - FragmentExecParams destParams = fragmentExecParams.get(destFragment.getFragmentId()); + FragmentExecParams destParams = fragmentExecParamsMap.get(destFragment.getFragmentId()); // set # of senders DataSink sink = params.fragment.getSink(); @@ -707,17 +739,18 @@ public class Coordinator { // we might have multiple fragments sending to this exchange node // (distributed MERGE), which is why we need to add up the #senders if (destParams.perExchNumSenders.get(exchId.asInt()) == null) { - destParams.perExchNumSenders.put(exchId.asInt(), params.hosts.size()); + destParams.perExchNumSenders.put(exchId.asInt(), params.instanceExecParams.size()); } else { destParams.perExchNumSenders.put(exchId.asInt(), - params.hosts.size() + destParams.perExchNumSenders.get(exchId.asInt())); + params.instanceExecParams.size() + destParams.perExchNumSenders.get(exchId.asInt())); } // add destination host to this 
fragment's destination - for (int j = 0; j < destParams.hosts.size(); ++j) { + for (int j = 0; j < destParams.instanceExecParams.size(); ++j) { TPlanFragmentDestination dest = new TPlanFragmentDestination(); - dest.fragment_instance_id = destParams.instanceIds.get(j); - dest.server = toRpcHost(destParams.hosts.get(j)); + dest.fragment_instance_id = destParams.instanceExecParams.get(j).instanceId; + dest.server = toRpcHost(destParams.instanceExecParams.get(j).host); + dest.setBrpc_server(toBrpcHost(destParams.instanceExecParams.get(j).host)); params.destinations.add(dest); } } @@ -733,6 +766,18 @@ public class Coordinator { return dest; } + private TNetworkAddress toBrpcHost(TNetworkAddress host) throws Exception { + Backend backend = Catalog.getCurrentSystemInfo().getBackendWithBePort( + host.getHostname(), host.getPort()); + if (backend == null) { + throw new InternalException("there is no scanNode Backend"); + } + if (backend.getBrpcPort() < 0) { + return null; + } + return new TNetworkAddress(backend.getHost(), backend.getBrpcPort()); + } + // estimate if this fragment contains UnionNode private boolean containsUnionNode(PlanNode node) { if (node instanceof UnionNode) { @@ -760,7 +805,7 @@ public class Coordinator { // compute hosts *bottom up*. 
for (int i = fragments.size() - 1; i >= 0; --i) { PlanFragment fragment = fragments.get(i); - FragmentExecParams params = fragmentExecParams.get(fragment.getFragmentId()); + FragmentExecParams params = fragmentExecParamsMap.get(fragment.getFragmentId()); if (fragment.getDataPartition() == DataPartition.UNPARTITIONED) { Reference backendIdRef = new Reference(); @@ -770,8 +815,9 @@ public class Coordinator { throw new InternalException("there is no scanNode Backend"); } this.addressToBackendID.put(execHostport, backendIdRef.getRef()); - - params.hosts.add(execHostport); + FInstanceExecParam instanceParam = new FInstanceExecParam(null, execHostport, + 0, params); + params.instanceExecParams.add(instanceParam); continue; } @@ -788,30 +834,338 @@ public class Coordinator { PlanFragmentId inputFragmentIdx = fragments.get(i).getChild(0).getFragmentId(); // AddAll() soft copy() - params.hosts.addAll(fragmentExecParams.get(inputFragmentIdx).hosts); + for (FInstanceExecParam execParams + : fragmentExecParamsMap.get(inputFragmentIdx).instanceExecParams) { + FInstanceExecParam instanceParam = new FInstanceExecParam(null, execParams.host, + 0, params); + params.instanceExecParams.add(instanceParam); + } + // TODO: switch to unpartitioned/coord execution if our input fragment // is executed that way (could have been downgraded from distributed) continue; } - Iterator iter = scanRangeAssignment.get(fragment.getFragmentId()).entrySet().iterator(); + Iterator iter = fragmentExecParamsMap.get(fragment.getFragmentId()) + .scanRangeAssignment.entrySet().iterator(); while (iter.hasNext()) { Map.Entry entry = (Map.Entry) iter.next(); TNetworkAddress key = (TNetworkAddress) entry.getKey(); - params.hosts.add(key); + Map> value = + (Map>) entry.getValue(); + FInstanceExecParam instanceParam = new FInstanceExecParam(null, key, + 0, params); + for (Integer planNodeId : value.keySet()) { + instanceParam.perNodeScanRanges.put(planNodeId, value.get(planNodeId)); + } + 
params.instanceExecParams.add(instanceParam); } - if (params.hosts.isEmpty()) { + + if (params.instanceExecParams.isEmpty()) { Reference backendIdRef = new Reference(); TNetworkAddress execHostport = SimpleScheduler.getHost(this.idToBackend, backendIdRef); if (execHostport == null) { throw new InternalException("there is no scanNode Backend"); } this.addressToBackendID.put(execHostport, backendIdRef.getRef()); - - params.hosts.add(execHostport); + FInstanceExecParam instanceParam = new FInstanceExecParam(null, execHostport, + 0, params); + params.instanceExecParams.add(instanceParam); } } } + + + private void computeFragmentExecParamsForParallelExec() throws Exception { + // create exec params and set instance_id, host, per_node_scan_ranges + computeFragmentInstances(fragmentExecParamsMap.get(fragments.get(0).getFragmentId())); + + // Set destinations, per_exch_num_senders, sender_id. + for (PlanFragment srcFragment : fragments) { + if (!(srcFragment.getSink() instanceof DataStreamSink)) { + continue; + } + final PlanFragmentId desFragmentId = srcFragment.getDestFragment().getFragmentId(); + final FragmentExecParams srcParams = fragmentExecParamsMap.get(srcFragment.getFragmentId()); + final FragmentExecParams destParams = fragmentExecParamsMap.get(desFragmentId); + + // populate src_params->destinations + for (int i = 0; i < destParams.instanceExecParams.size(); i++) { + TPlanFragmentDestination dest = new TPlanFragmentDestination(); + dest.setFragment_instance_id(destParams.instanceExecParams.get(i).instanceId); + dest.setServer(toRpcHost(destParams.instanceExecParams.get(i).host)); + dest.setBrpc_server(toBrpcHost(destParams.instanceExecParams.get(i).host)); + srcParams.destinations.add(dest); + } + + final DataSink sinker = srcFragment.getSink(); + Preconditions.checkState( + sinker.getOutputPartition().getType() == TPartitionType.HASH_PARTITIONED + || sinker.getOutputPartition().getType() == TPartitionType.UNPARTITIONED + || 
sinker.getOutputPartition().getType() == TPartitionType.RANDOM); + + PlanNodeId exchId = sinker.getExchNodeId(); + Integer senderIdBase = destParams.perExchNumSenders.get(exchId); + if (senderIdBase == null) { + destParams.perExchNumSenders.put(exchId.asInt(), srcParams.instanceExecParams.size()); + senderIdBase = 0; + } else { + destParams.perExchNumSenders.put(exchId.asInt(), + senderIdBase + srcParams.instanceExecParams.size()); + } + + for (int i = 0; i < srcParams.instanceExecParams.size(); i++) { + FInstanceExecParam srcInstanceParam = srcParams.instanceExecParams.get(i); + srcInstanceParam.senderId = senderIdBase + i; + } + } + } + + // compute instances from fragment + private void computeFragmentInstances(FragmentExecParams params) throws Exception { + // // traverse input fragments + for (PlanFragmentId fragmentId : params.inputFragments) { + computeFragmentInstances(fragmentExecParamsMap.get(fragmentId)); + } + + // case 1: single instance executed at coordinator + final PlanFragment fragment = params.fragment; + if (fragment.getDataPartition() == DataPartition.UNPARTITIONED) { + Reference backendIdRef = new Reference(); + TNetworkAddress execHostport = SimpleScheduler.getHost(this.idToBackend, backendIdRef); + if (execHostport == null) { + LOG.warn("DataPartition UNPARTITIONED, no scanNode Backend"); + throw new InternalException("there is no scanNode Backend"); + } + TUniqueId instanceId = getNextInstanceId(); + FInstanceExecParam instanceParam = new FInstanceExecParam(instanceId, execHostport, + 0, params); + params.instanceExecParams.add(instanceParam); + this.addressToBackendID.put(execHostport, backendIdRef.getRef()); + return; + } + + if (containsUnionNode(fragment.getPlanRoot())) { + createUnionInstance(params); + return; + } + + PlanNode leftPlanNode = findLeftmostNode(fragment.getPlanRoot()); + if (leftPlanNode instanceof MysqlScanNode + || leftPlanNode instanceof OlapScanNode) { + // case 2: leaf fragment with leftmost scan + // TODO: check 
that there's only one scan in this fragment + createScanInstance(leftPlanNode.getId(), params); + } else { + // case 3: interior fragment without leftmost scan + // we assign the same hosts as those of our leftmost input fragment (so that a + // merge aggregation fragment runs on the hosts that provide the input data) + createCollocatedInstance(params); + } + } + + private List findScanNodes(PlanNode plan) { + List result = Lists.newArrayList(); + List nodeList = Lists.newArrayList(); + getAllNodes(plan, nodeList); + for (PlanNode node : nodeList) { + if (node instanceof MysqlScanNode + || node instanceof OlapScanNode) { + result.add(node.getId()); + } + } + return result; + } + + private void getAllNodes(PlanNode plan, List nodeList) { + if (plan.getChildren().size() > 0) { + nodeList.addAll(plan.getChildren()); + for (PlanNode child : plan.getChildren()) { + getAllNodes(child, nodeList); + } + } + nodeList.add(plan); + } + + private Set getScanHosts(PlanNodeId id, FragmentExecParams fragmentExecParams) { + Set result = Sets.newHashSet(); + for (TNetworkAddress host : fragmentExecParams.scanRangeAssignment.keySet()) { + Map> planNodeToScanRangeParams + = fragmentExecParams.scanRangeAssignment.get(host); + for (Integer planNodeId : planNodeToScanRangeParams.keySet()) { + if (id.asInt() == planNodeId) { + result.add(host); + } + } + } + + return result; + } + + public void createScanInstance(PlanNodeId leftMostScanId, FragmentExecParams fragmentExecParams) + throws InternalException { + int maxNumInstance = queryOptions.mt_dop; + if (maxNumInstance == 0) { + maxNumInstance = 1; + } + + if (fragmentExecParams.scanRangeAssignment.isEmpty()) { + // this scan doesn't have any scan ranges: run a single instance on the random backend + Reference backendIdRef = new Reference(); + TNetworkAddress execHostport = SimpleScheduler.getHost(this.idToBackend, backendIdRef); + if (execHostport == null) { + throw new InternalException("there is no scanNode Backend"); + } + 
FInstanceExecParam instanceParam = new FInstanceExecParam(getNextInstanceId(), execHostport, 0, + fragmentExecParams); + fragmentExecParams.instanceExecParams.add(instanceParam); + return; + } + + final int leftMostScanIdInt = leftMostScanId.asInt(); + int perFragmentInstanceIdx = 0; + for (TNetworkAddress host : fragmentExecParams.scanRangeAssignment.keySet()) { + // evenly divide up the scan ranges of the leftmost scan between at most + // instances + final Map> scanMap = fragmentExecParams.scanRangeAssignment.get(host); + final List scanRangesList = scanMap.get(leftMostScanIdInt); + Preconditions.checkState(scanRangesList != null); + // try to load-balance scan ranges by assigning just beyond the average number of + // bytes to each instance + // TODO: fix shortcomings introduced by uneven split sizes, + // this could end up assigning 0 scan ranges to an instance + final int numInstance = Math.min(maxNumInstance, scanRangesList.size()); + Preconditions.checkState(numInstance != 0); + final List perHostInstanceExecParams = Lists.newArrayList(); + // create FInstanceExecParam in one host + for (int i = 0; i < numInstance; i++) { + final FInstanceExecParam instanceParam = new FInstanceExecParam(getNextInstanceId(), + host, perFragmentInstanceIdx++, fragmentExecParams); + fragmentExecParams.instanceExecParams.add(instanceParam); + perHostInstanceExecParams.add(instanceParam); + List paramList = instanceParam.perNodeScanRanges.get(leftMostScanIdInt); + if (paramList == null) { + paramList = Lists.newArrayList(); + instanceParam.perNodeScanRanges.put(leftMostScanIdInt, paramList); + } + } + + // assign tablet + Collections.shuffle(scanRangesList); + for (int i = 0; i < scanRangesList.size(); i++) { + final TScanRangeParams scanRangeParams = scanRangesList.get(i); + final int position = i % numInstance; + perHostInstanceExecParams.get(position).perNodeScanRanges.get(leftMostScanIdInt).add(scanRangeParams); + } + } + } + + private void validate() { + int numFragments = 
0; + for (PlanFragment fragment : fragments) { + // TODO chenhao fragment' id produced in palo may larger than fragment sizes, + // need to update this after merge latest impala plan codes + //Preconditions.checkState(fragment.getFragmentId().asInt() <= fragments.size()); + Preconditions.checkState(fragment.getFragmentId() + == fragmentExecParamsMap.get(fragment.getFragmentId()).fragment.getFragmentId()); + ++numFragments; + } + + Preconditions.checkState(numFragments == fragmentExecParamsMap.size()); + + // we assigned the correct number of scan ranges per (host, node id): + // assemble a map from host -> (map from node id -> #scan ranges) + Map> countMap = Maps.newHashMap(); + for (FragmentExecParams fragmentExecParams : fragmentExecParamsMap.values()) { + for (FInstanceExecParam instanceExecParam : fragmentExecParams.instanceExecParams) { + Map planNodeIdToCount = countMap.get(instanceExecParam.host); + if (planNodeIdToCount == null) { + planNodeIdToCount = Maps.newHashMap(); + countMap.put(instanceExecParam.host, planNodeIdToCount); + } + + for (Integer planNodeId : instanceExecParam.perNodeScanRanges.keySet()) { + Integer count = planNodeIdToCount.get(planNodeId); + if (count == null) { + planNodeIdToCount.put(planNodeId, 0); + count = 0; + } + int lastCount = planNodeIdToCount.get(planNodeId); + planNodeIdToCount.put(planNodeId, lastCount + + instanceExecParam.perNodeScanRanges.get(planNodeId).size()); + } + } + } + + for (FragmentExecParams fragmentExecParams : fragmentExecParamsMap.values()) { + for (TNetworkAddress host : fragmentExecParams.scanRangeAssignment.keySet()) { + Preconditions.checkState(countMap.get(host).size() != 0); + final Map nodeCountMap = countMap.get(host); + Map> planNodeIdToScanRangeList + = fragmentExecParams.scanRangeAssignment.get(host); + for (Integer planNodeId : planNodeIdToScanRangeList.keySet()) { + Preconditions.checkState(nodeCountMap.get(planNodeId) > 0); + Preconditions.checkState(nodeCountMap.get(planNodeId) + == 
planNodeIdToScanRangeList.get(planNodeId).size()); + } + } + } + // TODO: add validation for BackendExecParams + } + + // create collocated instance according to inputFragments + public void createCollocatedInstance(FragmentExecParams fragmentExecParams) { + Preconditions.checkState(fragmentExecParams.inputFragments.size() >= 1); + final FragmentExecParams inputFragmentParams = fragmentExecParamsMap.get(fragmentExecParams. + inputFragments.get(0)); + int perFragmentInstanceIdx = 0; + for (FInstanceExecParam inputInstanceParams : inputFragmentParams.instanceExecParams) { + FInstanceExecParam instanceParam = new FInstanceExecParam(getNextInstanceId(), + inputInstanceParams.host, perFragmentInstanceIdx++, fragmentExecParams); + fragmentExecParams.instanceExecParams.add(instanceParam); + } + } + + private TUniqueId getNextInstanceId() { + TUniqueId result = nextInstanceId.deepCopy(); + nextInstanceId.lo++; + return result; + } + + + public void createUnionInstance(FragmentExecParams fragmentExecParams) { + final PlanFragment fragment = fragmentExecParams.fragment; + // Add hosts of scan nodes + List scanNodeIds = findScanNodes(fragment.getPlanRoot()); + + Set hostsSets = Sets.newHashSet(); + for(PlanNodeId id: scanNodeIds) { + hostsSets.addAll(getScanHosts(id, fragmentExecParams)); + } + + // UnionNode's child is not ScanNode + for (PlanFragmentId inputFragmentId : fragmentExecParams.inputFragments) { + FragmentExecParams inputeExecParams = fragmentExecParamsMap.get(inputFragmentId); + for (FInstanceExecParam instanceParam : inputeExecParams.instanceExecParams) { + hostsSets.add(instanceParam.host); + } + } + + // create a single instance per host + // TODO-MT: figure out how to parallelize Union + int perFragmentIdx = 0; + for (TNetworkAddress host : hostsSets) { + FInstanceExecParam instanceParam = new FInstanceExecParam(getNextInstanceId(), host, + perFragmentIdx++, fragmentExecParams); + // assign all scan ranges + 
fragmentExecParams.instanceExecParams.add(instanceParam); + if (fragmentExecParams.scanRangeAssignment.get(host) != null + && fragmentExecParams.scanRangeAssignment.get(host).size() > 0) { + instanceParam.perNodeScanRanges = fragmentExecParams.scanRangeAssignment.get(host); + } + } + } + // Returns the id of the leftmost node of any of the gives types in 'plan_root', // or INVALID_PLAN_NODE_ID if no such node present. @@ -860,7 +1214,7 @@ public class Coordinator { } FragmentScanRangeAssignment assignment = - scanRangeAssignment.get(scanNode.getFragmentId()); + fragmentExecParamsMap.get(scanNode.getFragmentId()).scanRangeAssignment; computeScanRangeAssignment(scanNode.getId(), locations, assignment); } } @@ -1065,15 +1419,15 @@ public class Coordinator { this.rpcParams = rpcParams; this.initiated = false; this.done = false; - String name = "Instance " + DebugUtil.printId(fragmentExecParams.get(fragmentId) - .instanceIds.get(instanceId)) + " (host=" + getBackendAddress() + ")"; + String name = "Instance " + DebugUtil.printId(fragmentExecParamsMap.get(fragmentId) + .instanceExecParams.get(instanceId).instanceId) + " (host=" + getBackendAddress() + ")"; this.profile = new RuntimeProfile(name); this.hasCanceled = false; this.addressToBackendID = addressToBackendID; } - public final TNetworkAddress getBackendAddress() { - return fragmentExecParams.get(fragmentId).hosts.get(instanceId); + public TNetworkAddress getBackendAddress() { + return fragmentExecParamsMap.get(fragmentId).instanceExecParams.get(instanceId).host; } public TUniqueId getFragmentInstanceId() { @@ -1151,16 +1505,18 @@ public class Coordinator { } } - // execution parameters for a single fragment; used to assemble the - // per-fragment instance TPlanFragmentExecParams; - // hosts.size() == instance_ids.size() + // execution parameters for a single fragment, + // per-fragment can have multiple FInstanceExecParam, + // used to assemble TPlanFragmentExecParas protected class FragmentExecParams { public 
PlanFragment fragment; - public List hosts = Lists.newArrayList(); - public List instanceIds = Lists.newArrayList(); public List destinations = Lists.newArrayList(); public Map perExchNumSenders = Maps.newHashMap(); - + + public List inputFragments = Lists.newArrayList(); + public List instanceExecParams = Lists.newArrayList(); + public FragmentScanRangeAssignment scanRangeAssignment = new FragmentScanRangeAssignment(); + public FragmentExecParams(PlanFragment fragment) { this.fragment = fragment; } @@ -1168,8 +1524,8 @@ public class Coordinator { List toThrift(int backendNum) { List paramsList = Lists.newArrayList(); - int tmpBackendNum = backendNum; - for (int i = 0; i < instanceIds.size(); ++i) { + for (int i = 0; i < instanceExecParams.size(); ++i) { + final FInstanceExecParam instanceExecParam = instanceExecParams.get(i); TExecPlanFragmentParams params = new TExecPlanFragmentParams(); params.setProtocol_version(PaloInternalServiceVersion.V1); params.setFragment(fragment.toThrift()); @@ -1177,20 +1533,20 @@ public class Coordinator { params.setParams(new TPlanFragmentExecParams()); params.setResource_info(tResourceInfo); params.params.setQuery_id(queryId); - params.params.setFragment_instance_id(instanceIds.get(i)); - TNetworkAddress address = fragmentExecParams.get(fragment.getFragmentId()).hosts.get(i); - - Map> scanRanges = - scanRangeAssignment.get(fragment.getFragmentId()).get(address); + params.params.setFragment_instance_id(instanceExecParam.instanceId); + + Map> scanRanges = instanceExecParam.perNodeScanRanges; if (scanRanges == null) { scanRanges = Maps.newHashMap(); } + params.params.setPer_node_scan_ranges(scanRanges); params.params.setPer_exch_num_senders(perExchNumSenders); + params.params.setDestinations(destinations); params.params.setSender_id(i); params.setCoord(coordAddress); - params.setBackend_num(tmpBackendNum++); + params.setBackend_num(backendNum++); params.setQuery_globals(queryGlobals); params.setQuery_options(queryOptions); @@ -1223,16 
+1579,16 @@ public class Coordinator { fragment.getPlanRoot().appendTrace(sb); sb.append(",instance=["); // append instance - for (int i = 0; i < instanceIds.size(); ++i) { + for (int i = 0; i < instanceExecParams.size(); ++i) { if (i != 0) { sb.append(","); } - TNetworkAddress address = fragmentExecParams.get(fragment.getFragmentId()).hosts.get(i); + TNetworkAddress address = instanceExecParams.get(i).host; Map> scanRanges = - scanRangeAssignment.get(fragment.getFragmentId()).get(address); + scanRangeAssignment.get(address); sb.append("{"); - sb.append("id=").append(DebugUtil.printId(instanceIds.get(i))); - sb.append(",host=").append(hosts.get(i).getHostname()); + sb.append("id=").append(DebugUtil.printId(instanceExecParams.get(i).instanceId)); + sb.append(",host=").append(instanceExecParams.get(i).host); if (scanRanges == null) { sb.append("}"); continue; @@ -1254,6 +1610,32 @@ public class Coordinator { } } + // fragment instance exec param, it is used to assemble + // the per-instance TPlanFragmentExecParas, as a member of + // FragmentExecParams + static class FInstanceExecParam { + TUniqueId instanceId; + TNetworkAddress host; + Map> perNodeScanRanges = Maps.newHashMap(); + + int perFragmentInstanceIdx; + int senderId; + + FragmentExecParams fragmentExecParams; + + public FInstanceExecParam(TUniqueId id, TNetworkAddress host, + int perFragmentInstanceIdx, FragmentExecParams fragmentExecParams) { + this.instanceId = id; + this.host = host; + this.perFragmentInstanceIdx = perFragmentInstanceIdx; + this.fragmentExecParams = fragmentExecParams; + } + + public PlanFragment fragment() { + return fragmentExecParams.fragment; + } + } + private static class ParallelExecutor { private static final ExecutorService EXECUTOR = Executors.newCachedThreadPool(); diff --git a/fe/src/com/baidu/palo/qe/SessionVariable.java b/fe/src/com/baidu/palo/qe/SessionVariable.java index 981a113299..4e9cbefffd 100644 --- a/fe/src/com/baidu/palo/qe/SessionVariable.java +++ 
b/fe/src/com/baidu/palo/qe/SessionVariable.java @@ -15,18 +15,19 @@ package com.baidu.palo.qe; -import com.baidu.palo.thrift.TQueryOptions; -import com.baidu.palo.common.io.Writable; import com.baidu.palo.catalog.Catalog; import com.baidu.palo.common.FeMetaVersion; import com.baidu.palo.common.io.Text; +import com.baidu.palo.common.io.Writable; +import com.baidu.palo.thrift.TQueryOptions; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; -import java.io.Serializable; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; +import java.io.Serializable; // System variable public class SessionVariable implements Serializable, Writable { @@ -59,7 +60,12 @@ public class SessionVariable implements Serializable, Writable { public static final String SQL_SAFE_UPDATES = "sql_safe_updates"; public static final String NET_BUFFER_LENGTH = "net_buffer_length"; public static final String CODEGEN_LEVEL = "codegen_level"; - + // mem limit can't smaller than bufferpool's default page size + public static final int MIN_EXEC_MEM_LIMIT = 2097152; + public static final String BATCH_SIZE = "batch_size"; + public static final String DISABLE_STREAMING_PREAGGREGATIONS = "disable_streaming_preaggregations"; + public static final String MT_DOP = "mt_dop"; + // max memory used on every backend. @VariableMgr.VarAttr(name = EXEC_MEM_LIMIT) public long maxExecMemByte = 2147483648L; @@ -154,7 +160,17 @@ public class SessionVariable implements Serializable, Writable { // if true, need report to coordinator when plan fragment execute successfully. 
@VariableMgr.VarAttr(name = CODEGEN_LEVEL) - private int codegenLevel = 0; + private int codegenLevel = 0; + + // multithreaded degree of intra-node parallelism + @VariableMgr.VarAttr(name = MT_DOP) + private int mtDop = 0; + + @VariableMgr.VarAttr(name = BATCH_SIZE) + private int batchSize = 1024; + + @VariableMgr.VarAttr(name = DISABLE_STREAMING_PREAGGREGATIONS) + private boolean disableStreamPreaggregations = false; public long getMaxExecMemByte() { return maxExecMemByte; @@ -357,7 +373,11 @@ public class SessionVariable implements Serializable, Writable { } public void setMaxExecMemByte(long maxExecMemByte) { - this.maxExecMemByte = maxExecMemByte; + if (maxExecMemByte < MIN_EXEC_MEM_LIMIT) { + this.maxExecMemByte = MIN_EXEC_MEM_LIMIT; + } else { + this.maxExecMemByte = maxExecMemByte; + } } public void setQueryTimeoutS(int queryTimeoutS) { @@ -376,13 +396,32 @@ public class SessionVariable implements Serializable, Writable { this.resourceGroup = resourceGroup; } + public int getMtDop() { + return this.mtDop; + } + + public void setMtDop(int mtDop) { + this.mtDop = mtDop; + } + // Serialize to thrift object TQueryOptions toThrift() { TQueryOptions tResult = new TQueryOptions(); tResult.setMem_limit(maxExecMemByte); + + // TODO chenhao, reservation will be calculated by cost + tResult.setMin_reservation(0); + tResult.setMax_reservation(maxExecMemByte); + tResult.setInitial_reservation_total_claims(maxExecMemByte); + tResult.setBuffer_pool_limit(maxExecMemByte); + tResult.setQuery_timeout(queryTimeoutS); tResult.setIs_report_success(isReportSucc); tResult.setCodegen_level(codegenLevel); + + tResult.setBatch_size(batchSize); + tResult.setDisable_stream_preaggregations(disableStreamPreaggregations); + tResult.setMt_dop(mtDop); return tResult; } @@ -415,6 +454,9 @@ public class SessionVariable implements Serializable, Writable { out.writeInt(queryTimeoutS); out.writeLong(maxExecMemByte); Text.writeString(out, collationServer); + out.writeInt(batchSize); + 
out.writeBoolean(disableStreamPreaggregations); + out.writeInt(mtDop); } @Override @@ -446,7 +488,12 @@ public class SessionVariable implements Serializable, Writable { queryTimeoutS = in.readInt(); maxExecMemByte = in.readLong(); if (Catalog.getCurrentCatalogJournalVersion() >= FeMetaVersion.VERSION_37) { - collationServer = Text.readString(in); + collationServer = Text.readString(in); + } + if (Catalog.getCurrentCatalogJournalVersion() >= FeMetaVersion.VERSION_38) { + batchSize = in.readInt(); + disableStreamPreaggregations = in.readBoolean(); + mtDop = in.readInt(); } } } diff --git a/fe/src/com/baidu/palo/qe/ShowExecutor.java b/fe/src/com/baidu/palo/qe/ShowExecutor.java index 1394ca5364..9a8d5bb3ac 100644 --- a/fe/src/com/baidu/palo/qe/ShowExecutor.java +++ b/fe/src/com/baidu/palo/qe/ShowExecutor.java @@ -15,15 +15,6 @@ package com.baidu.palo.qe; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - import com.baidu.palo.analysis.DescribeStmt; import com.baidu.palo.analysis.HelpStmt; import com.baidu.palo.analysis.ShowAlterStmt; @@ -87,11 +78,21 @@ import com.baidu.palo.load.LoadErrorHub; import com.baidu.palo.load.LoadErrorHub.HubType; import com.baidu.palo.load.LoadJob; import com.baidu.palo.load.LoadJob.JobState; + import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.Sets; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + // Execute one show statement. 
public class ShowExecutor { private static final Logger LOG = LogManager.getLogger(ShowExecutor.class); @@ -802,10 +803,12 @@ public class ShowExecutor { } } while (false); + String detailCmd = String.format("SHOW PROC '/dbs/%d/%d/partitions/%d/%d/%d/';", + dbId, tableId, partitionId, indexId, tabletId); rows.add(Lists.newArrayList(dbName, tableName, partitionName, indexName, dbId.toString(), tableId.toString(), partitionId.toString(), indexId.toString(), - isSync.toString())); + isSync.toString(), detailCmd)); } else { Database db = catalog.getDb(showStmt.getDbName()); if (db == null) { diff --git a/fe/src/com/baidu/palo/qe/StmtExecutor.java b/fe/src/com/baidu/palo/qe/StmtExecutor.java index 7c1ef6dcee..a444bd8f09 100644 --- a/fe/src/com/baidu/palo/qe/StmtExecutor.java +++ b/fe/src/com/baidu/palo/qe/StmtExecutor.java @@ -45,6 +45,7 @@ import com.baidu.palo.common.DdlException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.NotImplementedException; import com.baidu.palo.common.util.DebugUtil; import com.baidu.palo.common.util.ProfileManager; import com.baidu.palo.common.util.RuntimeProfile; @@ -298,7 +299,8 @@ public class StmtExecutor { // Analyze one statement to structure in memory. 
- private void analyze() throws AnalysisException { + private void analyze() throws AnalysisException, InternalException, + NotImplementedException { LOG.info("the originStmt is ={}", originStmt); // Parse statement with parser generated by CUP&FLEX SqlScanner input = new SqlScanner(new StringReader(originStmt)); @@ -354,6 +356,12 @@ public class StmtExecutor { lock(dbs); try { parsedStmt.analyze(analyzer); + // TODO chenhao16, InsertStmt's QueryStmt rewrite + StatementBase originStmt = null; + if (parsedStmt instanceof InsertStmt) { + originStmt = parsedStmt; + parsedStmt = ((InsertStmt) parsedStmt).getQueryStmt(); + } if (parsedStmt instanceof QueryStmt) { QueryStmt queryStmt1 = (QueryStmt)parsedStmt; boolean isExplain = ((QueryStmt) parsedStmt).isExplain(); @@ -394,10 +402,14 @@ public class StmtExecutor { if (isExplain) parsedStmt.setIsExplain(isExplain); } } + + if (originStmt != null && originStmt instanceof InsertStmt) { + parsedStmt = originStmt; + } // create plan planner = new Planner(); if (parsedStmt instanceof QueryStmt || parsedStmt instanceof InsertStmt) { - planner.plan(parsedStmt, analyzer, new TQueryOptions()); + planner.plan(parsedStmt, analyzer, context.getSessionVariable().toThrift()); } else { planner.plan(((CreateTableAsSelectStmt) parsedStmt).getInsertStmt(), analyzer, new TQueryOptions()); @@ -407,7 +419,9 @@ public class StmtExecutor { } catch (AnalysisException e) { throw e; } catch (InternalException e) { - throw new AnalysisException(e.getMessage()); + throw e; + } catch (NotImplementedException e) { + throw e; } catch (Exception e) { LOG.warn("Analyze failed because ", e); throw new AnalysisException("Internal Error, maybe this is a bug, please contact with Palo RD."); diff --git a/fe/src/com/baidu/palo/qe/VariableMgr.java b/fe/src/com/baidu/palo/qe/VariableMgr.java index 965bfc2403..eda0842f34 100644 --- a/fe/src/com/baidu/palo/qe/VariableMgr.java +++ b/fe/src/com/baidu/palo/qe/VariableMgr.java @@ -18,23 +18,26 @@ package 
com.baidu.palo.qe; import com.baidu.palo.analysis.SetType; import com.baidu.palo.analysis.SetVar; import com.baidu.palo.analysis.SysVariableDesc; -import com.baidu.palo.catalog.PrimitiveType; +import com.baidu.palo.catalog.Catalog; import com.baidu.palo.catalog.Type; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.DdlException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; import com.baidu.palo.common.PatternMatcher; -import com.baidu.palo.catalog.Catalog; import com.baidu.palo.persist.EditLog; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSortedMap; import com.google.common.collect.Lists; -import org.apache.commons.lang.SerializationUtils; -import org.apache.logging.log4j.Logger; -import org.apache.logging.log4j.LogManager; +import org.apache.commons.lang.SerializationUtils; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.reflect.Field; @@ -46,9 +49,6 @@ import java.util.Map; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.IOException; // Variable manager, merge session variable and global variable public class VariableMgr { @@ -244,8 +244,11 @@ public class VariableMgr { public static void read(DataInputStream in) throws IOException, DdlException { wlock.lock(); - globalSessionVariable.readFields(in); - wlock.unlock(); + try { + globalSessionVariable.readFields(in); + } finally { + wlock.unlock(); + } } private static void writeGlobalVariableUpdate(SessionVariable variable, String msg) { diff --git 
a/fe/src/com/baidu/palo/service/FrontendOptions.java b/fe/src/com/baidu/palo/service/FrontendOptions.java index d49216a3ab..2df70f34d2 100644 --- a/fe/src/com/baidu/palo/service/FrontendOptions.java +++ b/fe/src/com/baidu/palo/service/FrontendOptions.java @@ -93,6 +93,10 @@ public class FrontendOptions { return localAddr.getHostAddress(); } + public static String getHostname() { + return localAddr.getHostName(); + } + private static void analyzePriorityCidrs() { String prior_cidrs = Config.priority_networks; if (Strings.isNullOrEmpty(prior_cidrs)) { diff --git a/fe/src/com/baidu/palo/system/Backend.java b/fe/src/com/baidu/palo/system/Backend.java index efb61e3bb2..6b85838592 100644 --- a/fe/src/com/baidu/palo/system/Backend.java +++ b/fe/src/com/baidu/palo/system/Backend.java @@ -13,418 +13,438 @@ // specific language governing permissions and limitations // under the License. -package com.baidu.palo.system; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicLong; -import java.util.concurrent.atomic.AtomicReference; - -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - -import com.baidu.palo.alter.DecommissionBackendJob.DecommissionType; -import com.baidu.palo.catalog.Catalog; -import com.baidu.palo.catalog.DiskInfo; -import com.baidu.palo.catalog.DiskInfo.DiskState; -import com.baidu.palo.common.FeMetaVersion; -import com.baidu.palo.common.io.Text; -import com.baidu.palo.common.io.Writable; -import com.baidu.palo.system.BackendEvent.BackendEventType; -import com.baidu.palo.thrift.TDisk; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Maps; -import com.google.common.eventbus.EventBus; - -/** - * This class extends the primary identifier of a 
Backend with ephemeral state, - * eg usage information, current administrative state etc. - */ -public class Backend implements Writable { - - public enum BackendState { +package com.baidu.palo.system; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.AtomicReference; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import com.baidu.palo.alter.DecommissionBackendJob.DecommissionType; +import com.baidu.palo.catalog.Catalog; +import com.baidu.palo.catalog.DiskInfo; +import com.baidu.palo.catalog.DiskInfo.DiskState; +import com.baidu.palo.common.FeMetaVersion; +import com.baidu.palo.common.io.Text; +import com.baidu.palo.common.io.Writable; +import com.baidu.palo.system.BackendEvent.BackendEventType; +import com.baidu.palo.thrift.TDisk; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Maps; +import com.google.common.eventbus.EventBus; + +/** + * This class extends the primary identifier of a Backend with ephemeral state, + * eg usage information, current administrative state etc. 
+ */ +public class Backend implements Writable { + + public enum BackendState { using, /* backend is belong to a cluster*/ offline, - free /* backend is not belong to any clusters */ - } - - private static final Logger LOG = LogManager.getLogger(Backend.class); - - private long id; - private String host; - - private int heartbeatPort; // heartbeat - private AtomicInteger bePort; // be - private AtomicInteger httpPort; // web service - private AtomicInteger beRpcPort; // be rpc port - - private AtomicLong lastUpdateMs; - private AtomicLong lastStartTime; - private AtomicBoolean isAlive; - - private AtomicBoolean isDecommissioned; - private AtomicInteger decommissionType; - private AtomicReference ownerClusterName; - // to index the state in some cluster - private AtomicInteger backendState; - // private BackendState backendState; - - // rootPath -> DiskInfo - private AtomicReference> disksRef; - - public Backend() { - this.host = ""; - this.lastUpdateMs = new AtomicLong(); - this.lastStartTime = new AtomicLong(); - this.isAlive = new AtomicBoolean(); - this.isDecommissioned = new AtomicBoolean(false); - - this.bePort = new AtomicInteger(); - this.httpPort = new AtomicInteger(); - this.beRpcPort = new AtomicInteger(); - this.disksRef = new AtomicReference>(ImmutableMap. of()); - - this.ownerClusterName = new AtomicReference(""); - this.backendState = new AtomicInteger(BackendState.free.ordinal()); - - this.decommissionType = new AtomicInteger(DecommissionType.SystemDecommission.ordinal()); - } - - public Backend(long id, String host, int heartbeatPort) { - this.id = id; - this.host = host; - this.heartbeatPort = heartbeatPort; - this.bePort = new AtomicInteger(-1); - this.httpPort = new AtomicInteger(-1); - this.beRpcPort = new AtomicInteger(-1); - this.lastUpdateMs = new AtomicLong(-1L); - this.lastStartTime = new AtomicLong(-1L); - this.disksRef = new AtomicReference>(ImmutableMap. 
of()); - - this.isAlive = new AtomicBoolean(false); - this.isDecommissioned = new AtomicBoolean(false); - - this.ownerClusterName = new AtomicReference(""); - this.backendState = new AtomicInteger(BackendState.free.ordinal()); - this.decommissionType = new AtomicInteger(DecommissionType.SystemDecommission.ordinal()); - } - - public long getId() { - return id; - } - - public String getHost() { - return host; - } - - public int getBePort() { - return bePort.get(); - } - - public int getHeartbeatPort() { - return heartbeatPort; - } - - public int getHttpPort() { - return httpPort.get(); - } - - public int getBeRpcPort() { - return beRpcPort.get(); - } - - public void updateOnce(int bePort, int httpPort, int beRpcPort) { - boolean isChanged = false; - if (this.bePort.get() != bePort) { - isChanged = true; - this.bePort.set(bePort); - } - - if (this.httpPort.get() != httpPort) { - isChanged = true; - this.httpPort.set(httpPort); - } - - if (this.beRpcPort.get() != beRpcPort) { - isChanged = true; - this.beRpcPort.set(beRpcPort); - } - - long currentTime = System.currentTimeMillis(); - this.lastUpdateMs.set(currentTime); - if (!isAlive.get()) { - isChanged = true; - this.lastStartTime.set(currentTime); - LOG.info("{} is alive,", this.toString()); - this.isAlive.set(true); - } - - if (isChanged) { - Catalog.getInstance().getEditLog().logBackendStateChange(this); - } - } - - public boolean setDecommissioned(boolean isDecommissioned) { - if (this.isDecommissioned.compareAndSet(!isDecommissioned, isDecommissioned)) { - LOG.warn("{} set decommission: {}", this.toString(), isDecommissioned); - return true; - } - return false; - } - - public void setBad(EventBus eventBus) { - if (isAlive.compareAndSet(true, false)) { - Catalog.getInstance().getEditLog().logBackendStateChange(this); - LOG.warn("{} is dead", this.toString()); - } - - eventBus.post(new BackendEvent(BackendEventType.BACKEND_DOWN, "missing heartbeat", Long.valueOf(id))); - } - - public void 
setBackendState(BackendState state) { - this.backendState.set(state.ordinal()); - } - - public void setAlive(boolean isAlive) { - this.isAlive.set(isAlive); - } - - public void setBePort(int agentPort) { - this.bePort.set(agentPort); - } - - public void setHttpPort(int httpPort) { - this.httpPort.set(httpPort); - } - - public void setBeRpcPort(int beRpcPort) { - this.beRpcPort.set(beRpcPort); - } - - public long getLastUpdateMs() { - return this.lastUpdateMs.get(); - } - - public void setLastUpdateMs(long currentTime) { - this.lastUpdateMs.set(currentTime); - } - - public long getLastStartTime() { - return this.lastStartTime.get(); - } - - public void setLastStartTime(long currentTime) { - this.lastStartTime.set(currentTime); - } - - public boolean isAlive() { - return this.isAlive.get(); - } - - public boolean isDecommissioned() { - return this.isDecommissioned.get(); - } - - public boolean isAvailable() { - return this.isAlive.get() && !this.isDecommissioned.get(); - } - - public void setDisks(ImmutableMap disks) { - this.disksRef.set(disks); - } - - /** - * backend belong to some cluster - * - * @return - */ - public boolean isUsedByCluster() { - return this.backendState.get() == BackendState.using.ordinal(); - } - - /** - * backend is free, and it isn't belong to any cluster - * - * @return - */ - public boolean isFreeFromCluster() { - return this.backendState.get() == BackendState.free.ordinal(); - } - - /** - * backend execute discommission in cluster , and backendState will be free - * finally - * - * @return - */ - public boolean isOffLineFromCluster() { - return this.backendState.get() == BackendState.offline.ordinal(); - } - - public ImmutableMap getDisks() { - return this.disksRef.get(); - } - - public List getDiskInfosAsString() { - ImmutableMap disks = disksRef.get(); - List diskInfoStrings = new LinkedList(); - for (DiskInfo diskInfo : disks.values()) { - diskInfoStrings.add(diskInfo.getRootPath() + "|" + diskInfo.getTotalCapacityB() + "|" - + 
diskInfo.getDataUsedCapacityB() + "|" + diskInfo.getAvailableCapacityB() + "|" - + diskInfo.getState().name()); - } - return diskInfoStrings; - } - - public long getTotalCapacityB() { - ImmutableMap disks = disksRef.get(); - long totalCapacityB = 0L; - for (DiskInfo diskInfo : disks.values()) { - if (diskInfo.getState() == DiskState.ONLINE) { - totalCapacityB += diskInfo.getTotalCapacityB(); - } - } - return totalCapacityB; - } - - public long getAvailableCapacityB() { - // when cluster init, disks is empty, return 1L. - ImmutableMap disks = disksRef.get(); - long availableCapacityB = 1L; - for (DiskInfo diskInfo : disks.values()) { - if (diskInfo.getState() == DiskState.ONLINE) { - availableCapacityB += diskInfo.getAvailableCapacityB(); - } - } - return availableCapacityB; - } + free /* backend is not belong to any clusters */ + } + + private static final Logger LOG = LogManager.getLogger(Backend.class); + + private long id; + private String host; + + private int heartbeatPort; // heartbeat + private AtomicInteger bePort; // be + private AtomicInteger httpPort; // web service + private AtomicInteger beRpcPort; // be rpc port + private AtomicInteger brpcPort = new AtomicInteger(-1); + + private AtomicLong lastUpdateMs; + private AtomicLong lastStartTime; + private AtomicBoolean isAlive; + + private AtomicBoolean isDecommissioned; + private AtomicInteger decommissionType; + private AtomicReference ownerClusterName; + // to index the state in some cluster + private AtomicInteger backendState; + // private BackendState backendState; + + // rootPath -> DiskInfo + private AtomicReference> disksRef; + + public Backend() { + this.host = ""; + this.lastUpdateMs = new AtomicLong(); + this.lastStartTime = new AtomicLong(); + this.isAlive = new AtomicBoolean(); + this.isDecommissioned = new AtomicBoolean(false); + + this.bePort = new AtomicInteger(); + this.httpPort = new AtomicInteger(); + this.beRpcPort = new AtomicInteger(); + this.disksRef = new 
AtomicReference>(ImmutableMap. of()); + + this.ownerClusterName = new AtomicReference(""); + this.backendState = new AtomicInteger(BackendState.free.ordinal()); + + this.decommissionType = new AtomicInteger(DecommissionType.SystemDecommission.ordinal()); + } + + public Backend(long id, String host, int heartbeatPort) { + this.id = id; + this.host = host; + this.heartbeatPort = heartbeatPort; + this.bePort = new AtomicInteger(-1); + this.httpPort = new AtomicInteger(-1); + this.beRpcPort = new AtomicInteger(-1); + this.lastUpdateMs = new AtomicLong(-1L); + this.lastStartTime = new AtomicLong(-1L); + this.disksRef = new AtomicReference>(ImmutableMap. of()); + + this.isAlive = new AtomicBoolean(false); + this.isDecommissioned = new AtomicBoolean(false); + + this.ownerClusterName = new AtomicReference(""); + this.backendState = new AtomicInteger(BackendState.free.ordinal()); + this.decommissionType = new AtomicInteger(DecommissionType.SystemDecommission.ordinal()); + } + + public long getId() { + return id; + } + + public String getHost() { + return host; + } + + public int getBePort() { + return bePort.get(); + } + + public int getHeartbeatPort() { + return heartbeatPort; + } + + public int getHttpPort() { + return httpPort.get(); + } + + public int getBeRpcPort() { + return beRpcPort.get(); + } + + public int getBrpcPort() { + return brpcPort.get(); + } + + // back compatible with unit test + public void updateOnce(int bePort, int httpPort, int beRpcPort) { + updateOnce(bePort, httpPort, beRpcPort, -1); + } + + public void updateOnce(int bePort, int httpPort, int beRpcPort, int brpcPort) { + boolean isChanged = false; + if (this.bePort.get() != bePort) { + isChanged = true; + this.bePort.set(bePort); + } + + if (this.httpPort.get() != httpPort) { + isChanged = true; + this.httpPort.set(httpPort); + } + + if (this.beRpcPort.get() != beRpcPort) { + isChanged = true; + this.beRpcPort.set(beRpcPort); + } + + if (this.brpcPort.get() != brpcPort) { + isChanged = true; + 
this.brpcPort.set(brpcPort); + } + + long currentTime = System.currentTimeMillis(); + this.lastUpdateMs.set(currentTime); + if (!isAlive.get()) { + isChanged = true; + this.lastStartTime.set(currentTime); + LOG.info("{} is alive,", this.toString()); + this.isAlive.set(true); + } + + if (isChanged) { + Catalog.getInstance().getEditLog().logBackendStateChange(this); + } + } + + public boolean setDecommissioned(boolean isDecommissioned) { + if (this.isDecommissioned.compareAndSet(!isDecommissioned, isDecommissioned)) { + LOG.warn("{} set decommission: {}", this.toString(), isDecommissioned); + return true; + } + return false; + } + + public void setBad(EventBus eventBus) { + if (isAlive.compareAndSet(true, false)) { + Catalog.getInstance().getEditLog().logBackendStateChange(this); + LOG.warn("{} is dead", this.toString()); + } + + eventBus.post(new BackendEvent(BackendEventType.BACKEND_DOWN, "missing heartbeat", Long.valueOf(id))); + } + + public void setBackendState(BackendState state) { + this.backendState.set(state.ordinal()); + } + + public void setAlive(boolean isAlive) { + this.isAlive.set(isAlive); + } + + public void setBePort(int agentPort) { + this.bePort.set(agentPort); + } + + public void setHttpPort(int httpPort) { + this.httpPort.set(httpPort); + } + + public void setBeRpcPort(int beRpcPort) { + this.beRpcPort.set(beRpcPort); + } + + public void setBrpcPort(int brpcPort) { + this.brpcPort.set(brpcPort); + } + + public long getLastUpdateMs() { + return this.lastUpdateMs.get(); + } + + public void setLastUpdateMs(long currentTime) { + this.lastUpdateMs.set(currentTime); + } + + public long getLastStartTime() { + return this.lastStartTime.get(); + } + + public void setLastStartTime(long currentTime) { + this.lastStartTime.set(currentTime); + } + + public boolean isAlive() { + return this.isAlive.get(); + } + + public boolean isDecommissioned() { + return this.isDecommissioned.get(); + } + + public boolean isAvailable() { + return this.isAlive.get() && 
!this.isDecommissioned.get(); + } + + public void setDisks(ImmutableMap disks) { + this.disksRef.set(disks); + } + + /** + * backend belong to some cluster + * + * @return + */ + public boolean isUsedByCluster() { + return this.backendState.get() == BackendState.using.ordinal(); + } + + /** + * backend is free, and it isn't belong to any cluster + * + * @return + */ + public boolean isFreeFromCluster() { + return this.backendState.get() == BackendState.free.ordinal(); + } + + /** + * backend execute discommission in cluster , and backendState will be free + * finally + * + * @return + */ + public boolean isOffLineFromCluster() { + return this.backendState.get() == BackendState.offline.ordinal(); + } + + public ImmutableMap getDisks() { + return this.disksRef.get(); + } + + public List getDiskInfosAsString() { + ImmutableMap disks = disksRef.get(); + List diskInfoStrings = new LinkedList(); + for (DiskInfo diskInfo : disks.values()) { + diskInfoStrings.add(diskInfo.getRootPath() + "|" + diskInfo.getTotalCapacityB() + "|" + + diskInfo.getDataUsedCapacityB() + "|" + diskInfo.getAvailableCapacityB() + "|" + + diskInfo.getState().name()); + } + return diskInfoStrings; + } + + public long getTotalCapacityB() { + ImmutableMap disks = disksRef.get(); + long totalCapacityB = 0L; + for (DiskInfo diskInfo : disks.values()) { + if (diskInfo.getState() == DiskState.ONLINE) { + totalCapacityB += diskInfo.getTotalCapacityB(); + } + } + return totalCapacityB; + } + + public long getAvailableCapacityB() { + // when cluster init, disks is empty, return 1L. 
+ ImmutableMap disks = disksRef.get(); + long availableCapacityB = 1L; + for (DiskInfo diskInfo : disks.values()) { + if (diskInfo.getState() == DiskState.ONLINE) { + availableCapacityB += diskInfo.getAvailableCapacityB(); + } + } + return availableCapacityB; + } + + public long getDataUsedCapacityB() { + ImmutableMap disks = disksRef.get(); + long dataUsedCapacityB = 0L; + for (DiskInfo diskInfo : disks.values()) { + if (diskInfo.getState() == DiskState.ONLINE) { + dataUsedCapacityB += diskInfo.getDataUsedCapacityB(); + } + } + return dataUsedCapacityB; + } + + public void updateDisks(Map backendDisks) { + // update status or add new diskInfo + ImmutableMap disks = disksRef.get(); + Map newDisks = Maps.newHashMap(); + for (TDisk tDisk : backendDisks.values()) { + String rootPath = tDisk.getRoot_path(); + long totalCapacityB = tDisk.getDisk_total_capacity(); + long dataUsedCapacityB = tDisk.getData_used_capacity(); + long diskAvailableCapacityB = tDisk.getDisk_available_capacity(); + boolean isUsed = tDisk.isUsed(); + + DiskInfo diskInfo = disks.get(rootPath); + if (diskInfo == null) { + diskInfo = new DiskInfo(rootPath); + LOG.info("add new disk info. backendId: {}, rootPath: {}", id, rootPath); + } + newDisks.put(rootPath, diskInfo); + + diskInfo.setTotalCapacityB(totalCapacityB); + diskInfo.setDataUsedCapacityB(dataUsedCapacityB); + diskInfo.setAvailableCapacityB(diskAvailableCapacityB); + if (isUsed) { + diskInfo.setState(DiskState.ONLINE); + } else { + diskInfo.setState(DiskState.OFFLINE); + } + LOG.debug("update disk info. backendId: {}, diskInfo: {}", id, diskInfo.toString()); + } + + // remove not exist rootPath in backend + // no remove op. just log + for (DiskInfo diskInfo : disks.values()) { + String rootPath = diskInfo.getRootPath(); + if (!backendDisks.containsKey(rootPath)) { + LOG.warn("remove not exist rootPath. 
backendId: {}, rootPath: {}", id, rootPath); + } + } + + // update disksRef + disksRef.set(ImmutableMap.copyOf(newDisks)); + + // log disk changing + Catalog.getInstance().getEditLog().logBackendStateChange(this); + } + + public static Backend read(DataInput in) throws IOException { + Backend backend = new Backend(); + backend.readFields(in); + return backend; + } + + @Override + public void write(DataOutput out) throws IOException { + out.writeLong(id); + Text.writeString(out, host); + out.writeInt(heartbeatPort); + out.writeInt(bePort.get()); + out.writeInt(httpPort.get()); + out.writeInt(beRpcPort.get()); + out.writeBoolean(isAlive.get()); + out.writeBoolean(isDecommissioned.get()); + out.writeLong(lastUpdateMs.get()); + + out.writeLong(lastStartTime.get()); + + ImmutableMap disks = disksRef.get(); + out.writeInt(disks.size()); + for (Map.Entry entry : disks.entrySet()) { + Text.writeString(out, entry.getKey()); + entry.getValue().write(out); + } - public long getDataUsedCapacityB() { - ImmutableMap disks = disksRef.get(); - long dataUsedCapacityB = 0L; - for (DiskInfo diskInfo : disks.values()) { - if (diskInfo.getState() == DiskState.ONLINE) { - dataUsedCapacityB += diskInfo.getDataUsedCapacityB(); - } - } - return dataUsedCapacityB; - } - - public void updateDisks(Map backendDisks) { - // update status or add new diskInfo - ImmutableMap disks = disksRef.get(); - Map newDisks = Maps.newHashMap(); - for (TDisk tDisk : backendDisks.values()) { - String rootPath = tDisk.getRoot_path(); - long totalCapacityB = tDisk.getDisk_total_capacity(); - long dataUsedCapacityB = tDisk.getData_used_capacity(); - long diskAvailableCapacityB = tDisk.getDisk_available_capacity(); - boolean isUsed = tDisk.isUsed(); - - DiskInfo diskInfo = disks.get(rootPath); - if (diskInfo == null) { - diskInfo = new DiskInfo(rootPath); - LOG.info("add new disk info. 
backendId: {}, rootPath: {}", id, rootPath); - } - newDisks.put(rootPath, diskInfo); - - diskInfo.setTotalCapacityB(totalCapacityB); - diskInfo.setDataUsedCapacityB(dataUsedCapacityB); - diskInfo.setAvailableCapacityB(diskAvailableCapacityB); - if (isUsed) { - diskInfo.setState(DiskState.ONLINE); - } else { - diskInfo.setState(DiskState.OFFLINE); - } - LOG.debug("update disk info. backendId: {}, diskInfo: {}", id, diskInfo.toString()); - } - - // remove not exist rootPath in backend - // no remove op. just log - for (DiskInfo diskInfo : disks.values()) { - String rootPath = diskInfo.getRootPath(); - if (!backendDisks.containsKey(rootPath)) { - LOG.warn("remove not exist rootPath. backendId: {}, rootPath: {}", id, rootPath); - } - } - - // update disksRef - disksRef.set(ImmutableMap.copyOf(newDisks)); - - // log disk changing - Catalog.getInstance().getEditLog().logBackendStateChange(this); - } - - public static Backend read(DataInput in) throws IOException { - Backend backend = new Backend(); - backend.readFields(in); - return backend; - } - - @Override - public void write(DataOutput out) throws IOException { - out.writeLong(id); - Text.writeString(out, host); - out.writeInt(heartbeatPort); - out.writeInt(bePort.get()); - out.writeInt(httpPort.get()); - out.writeInt(beRpcPort.get()); - out.writeBoolean(isAlive.get()); - out.writeBoolean(isDecommissioned.get()); - out.writeLong(lastUpdateMs.get()); - - out.writeLong(lastStartTime.get()); - - ImmutableMap disks = disksRef.get(); - out.writeInt(disks.size()); - for (Map.Entry entry : disks.entrySet()) { - Text.writeString(out, entry.getKey()); - entry.getValue().write(out); - } - Text.writeString(out, ownerClusterName.get()); - out.writeInt(backendState.get()); - out.writeInt(decommissionType.get()); - - } - - @Override - public void readFields(DataInput in) throws IOException { - id = in.readLong(); - host = Text.readString(in); - heartbeatPort = in.readInt(); - bePort.set(in.readInt()); - httpPort.set(in.readInt()); 
- if (Catalog.getCurrentCatalogJournalVersion() >= FeMetaVersion.VERSION_31) { - beRpcPort.set(in.readInt()); - } - isAlive.set(in.readBoolean()); - - if (Catalog.getCurrentCatalogJournalVersion() >= 5) { - isDecommissioned.set(in.readBoolean()); - } - - lastUpdateMs.set(in.readLong()); - - if (Catalog.getCurrentCatalogJournalVersion() >= 2) { - lastStartTime.set(in.readLong()); - - Map disks = Maps.newHashMap(); - int size = in.readInt(); - for (int i = 0; i < size; i++) { - String rootPath = Text.readString(in); - DiskInfo diskInfo = DiskInfo.read(in); - disks.put(rootPath, diskInfo); - } - - disksRef.set(ImmutableMap.copyOf(disks)); - } + out.writeInt(backendState.get()); + out.writeInt(decommissionType.get()); + + out.writeInt(brpcPort.get()); + } + + @Override + public void readFields(DataInput in) throws IOException { + id = in.readLong(); + host = Text.readString(in); + heartbeatPort = in.readInt(); + bePort.set(in.readInt()); + httpPort.set(in.readInt()); + if (Catalog.getCurrentCatalogJournalVersion() >= FeMetaVersion.VERSION_31) { + beRpcPort.set(in.readInt()); + } + isAlive.set(in.readBoolean()); + + if (Catalog.getCurrentCatalogJournalVersion() >= 5) { + isDecommissioned.set(in.readBoolean()); + } + + lastUpdateMs.set(in.readLong()); + + if (Catalog.getCurrentCatalogJournalVersion() >= 2) { + lastStartTime.set(in.readLong()); + + Map disks = Maps.newHashMap(); + int size = in.readInt(); + for (int i = 0; i < size; i++) { + String rootPath = Text.readString(in); + DiskInfo diskInfo = DiskInfo.read(in); + disks.put(rootPath, diskInfo); + } + + disksRef.set(ImmutableMap.copyOf(disks)); + } if (Catalog.getCurrentCatalogJournalVersion() >= FeMetaVersion.VERSION_30) { ownerClusterName.set(Text.readString(in)); backendState.set(in.readInt()); @@ -434,29 +454,33 @@ public class Backend implements Writable { backendState.set(BackendState.using.ordinal()); decommissionType.set(DecommissionType.SystemDecommission.ordinal()); } - } - - @Override - public boolean 
equals(Object obj) { - if (this == obj) { - return true; - } - if (!(obj instanceof Backend)) { - return false; - } - - Backend backend = (Backend) obj; - - return (id == backend.id) && (host.equals(backend.host)) && (heartbeatPort == backend.heartbeatPort) - && (bePort.get() == backend.bePort.get()) && (isAlive.get() == backend.isAlive.get()); - } - - @Override - public String toString() { - return "Backend [id=" + id + ", host=" + host + ", heartbeatPort=" + heartbeatPort + ", alive=" + isAlive.get() - + "]"; - } - + + if (Catalog.getCurrentCatalogJournalVersion() >= FeMetaVersion.VERSION_40) { + brpcPort.set(in.readInt()); + } + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (!(obj instanceof Backend)) { + return false; + } + + Backend backend = (Backend) obj; + + return (id == backend.id) && (host.equals(backend.host)) && (heartbeatPort == backend.heartbeatPort) + && (bePort.get() == backend.bePort.get()) && (isAlive.get() == backend.isAlive.get()); + } + + @Override + public String toString() { + return "Backend [id=" + id + ", host=" + host + ", heartbeatPort=" + heartbeatPort + ", alive=" + isAlive.get() + + "]"; + } + public String getOwnerClusterName() { return ownerClusterName.get(); } @@ -469,28 +493,28 @@ public class Backend implements Writable { ownerClusterName.set(""); } - public BackendState getBackendState() { - switch (backendState.get()) { - case 0: - return BackendState.using; - case 1: - return BackendState.offline; - case 2: - return BackendState.free; - default: - return BackendState.free; - } - } - - public void setDecommissionType(DecommissionType type) { - decommissionType.set(type.ordinal()); - } - - public DecommissionType getDecommissionType() { - if (decommissionType.get() == DecommissionType.ClusterDecommission.ordinal()) { - return DecommissionType.ClusterDecommission; - } - return DecommissionType.SystemDecommission; - } + public BackendState getBackendState() { + switch 
(backendState.get()) { + case 0: + return BackendState.using; + case 1: + return BackendState.offline; + case 2: + return BackendState.free; + default: + return BackendState.free; + } + } -} + public void setDecommissionType(DecommissionType type) { + decommissionType.set(type.ordinal()); + } + + public DecommissionType getDecommissionType() { + if (decommissionType.get() == DecommissionType.ClusterDecommission.ordinal()) { + return DecommissionType.ClusterDecommission; + } + return DecommissionType.SystemDecommission; + } + +} diff --git a/fe/src/com/baidu/palo/system/SystemInfoService.java b/fe/src/com/baidu/palo/system/SystemInfoService.java index 0c5878929a..0797136daf 100644 --- a/fe/src/com/baidu/palo/system/SystemInfoService.java +++ b/fe/src/com/baidu/palo/system/SystemInfoService.java @@ -1094,6 +1094,7 @@ public class SystemInfoService extends Daemon { memoryBe.setDecommissioned(be.isDecommissioned()); memoryBe.setHttpPort(be.getHttpPort()); memoryBe.setBeRpcPort(be.getBeRpcPort()); + memoryBe.setBrpcPort(be.getBrpcPort()); memoryBe.setLastUpdateMs(be.getLastUpdateMs()); memoryBe.setLastStartTime(be.getLastStartTime()); memoryBe.setDisks(be.getDisks()); @@ -1157,7 +1158,11 @@ public class SystemInfoService extends Daemon { int bePort = tBackendInfo.getBe_port(); int httpPort = tBackendInfo.getHttp_port(); int beRpcPort = tBackendInfo.getBe_rpc_port(); - backend.updateOnce(bePort, httpPort, beRpcPort); + int brpcPort = -1; + if (tBackendInfo.isSetBrpc_port()) { + brpcPort = tBackendInfo.getBrpc_port(); + } + backend.updateOnce(bePort, httpPort, beRpcPort, brpcPort); } else { LOG.warn("failed to heartbeat backend[" + backendId + "]: " + result.getStatus().toString()); backend.setBad(eventBus); diff --git a/fe/test/com/baidu/palo/bdb/BDBToolOptionsTest.java b/fe/test/com/baidu/palo/bdb/BDBToolOptionsTest.java new file mode 100644 index 0000000000..f341c5dbac --- /dev/null +++ b/fe/test/com/baidu/palo/bdb/BDBToolOptionsTest.java @@ -0,0 +1,24 @@ +package 
com.baidu.palo.bdb; + +import com.baidu.palo.common.FeConstants; +import com.baidu.palo.journal.bdbje.BDBToolOptions; + +import org.junit.Assert; +import org.junit.Test; + +public class BDBToolOptionsTest { + + @Test + public void test() { + BDBToolOptions options = new BDBToolOptions(true, "", false, "", "", 0); + Assert.assertFalse(options.hasFromKey()); + Assert.assertFalse(options.hasEndKey()); + Assert.assertEquals(FeConstants.meta_version, options.getMetaVersion()); + + options = new BDBToolOptions(false, "12345", false, "12345", "12456", 35); + Assert.assertTrue(options.hasFromKey()); + Assert.assertTrue(options.hasEndKey()); + Assert.assertNotEquals(FeConstants.meta_version, options.getMetaVersion()); + } + +} diff --git a/fe/test/com/baidu/palo/bdb/BDBToolTest.java b/fe/test/com/baidu/palo/bdb/BDBToolTest.java new file mode 100644 index 0000000000..ee0b8ed3aa --- /dev/null +++ b/fe/test/com/baidu/palo/bdb/BDBToolTest.java @@ -0,0 +1,149 @@ +package com.baidu.palo.bdb; + +import com.baidu.palo.common.io.DataOutputBuffer; +import com.baidu.palo.journal.JournalEntity; +import com.baidu.palo.journal.bdbje.BDBTool; +import com.baidu.palo.journal.bdbje.BDBToolOptions; +import com.baidu.palo.persist.OperationType; +import com.baidu.palo.persist.ReplicaPersistInfo; + +import com.sleepycat.bind.tuple.TupleBinding; +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseConfig; +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.Environment; +import com.sleepycat.je.EnvironmentConfig; +import com.sleepycat.je.OperationStatus; + +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.File; +import java.io.IOException; + +public class BDBToolTest { + + private static Environment env; + private static String path = "./bdb"; + private static Database db; + private static String dbName = "12345"; + + @BeforeClass + public static 
void setEnv() { + try { + File file = new File("./bdb"); + file.deleteOnExit(); + file.mkdir(); + + // init env + EnvironmentConfig envConfig = new EnvironmentConfig(); + envConfig.setAllowCreate(true); + try { + env = new Environment(new File(path), envConfig); + } catch (DatabaseException e) { + e.printStackTrace(); + } + + // create db + DatabaseConfig dbConfig = new DatabaseConfig(); + dbConfig.setAllowCreate(true); + try { + db = env.openDatabase(null, dbName, dbConfig); + } catch (DatabaseException e) { + e.printStackTrace(); + } + + // write something + ReplicaPersistInfo info = ReplicaPersistInfo.createForAdd(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11); + JournalEntity entity = new JournalEntity(); + entity.setOpCode(OperationType.OP_ADD_REPLICA); + entity.setData(info); + + // id is the key + Long journalId = 23456L; + DatabaseEntry theKey = new DatabaseEntry(); + TupleBinding idBinding = TupleBinding.getPrimitiveBinding(Long.class); + idBinding.objectToEntry(journalId, theKey); + + // entity is the value + DataOutputBuffer buffer = new DataOutputBuffer(128); + try { + entity.write(buffer); + } catch (IOException e) { + e.printStackTrace(); + } + DatabaseEntry theData = new DatabaseEntry(buffer.getData()); + if (db.put(null, theKey, theData) == OperationStatus.SUCCESS) { + System.out.println("successfully writing the key: " + journalId); + } + + try { + if (db != null) { + db.close(); + } + if (env != null) { + env.cleanLog(); + env.close(); + } + } catch (DatabaseException e) { + e.printStackTrace(); + } + + } catch (Exception e) { + e.printStackTrace(); + } + } + + @AfterClass + public static void deleteEnv() { + File file = new File(path); + if (file.isDirectory()) { + String[] fileNames = file.list(); + for (int i = 0; i < fileNames.length; i++) { + File file2 = new File(path + "/" + fileNames[i]); + file2.delete(); + } + } + file.delete(); + System.out.println("file is deleted"); + } + + @Test + public void testList() { + BDBToolOptions options = new 
BDBToolOptions(true, "", false, "", "", 0); + BDBTool tool = new BDBTool(path, options); + Assert.assertTrue(tool.run()); + } + + @Test + public void testDbStat() { + // wrong db name + BDBToolOptions options = new BDBToolOptions(false, "12346", true, "", "", 0); + BDBTool tool = new BDBTool(path, options); + Assert.assertFalse(tool.run()); + + // right db name + options = new BDBToolOptions(false, "12345", true, "", "", 0); + tool = new BDBTool(path, options); + Assert.assertTrue(tool.run()); + } + + @Test + public void testGetKey() { + BDBToolOptions options = new BDBToolOptions(false, "12345", false, "", "", 0); + BDBTool tool = new BDBTool(path, options); + Assert.assertTrue(tool.run()); + + options = new BDBToolOptions(false, "12345", false, "23456", "12345", 0); + tool = new BDBTool(path, options); + Assert.assertFalse(tool.run()); + + options = new BDBToolOptions(false, "12345", false, "23456", "", 0); + tool = new BDBTool(path, options); + Assert.assertTrue(tool.run()); + } + +} diff --git a/fe/test/com/baidu/palo/catalog/DomainResolverServerTest.java b/fe/test/com/baidu/palo/catalog/DomainResolverServerTest.java index 000a23b71a..65291c906c 100644 --- a/fe/test/com/baidu/palo/catalog/DomainResolverServerTest.java +++ b/fe/test/com/baidu/palo/catalog/DomainResolverServerTest.java @@ -17,12 +17,16 @@ package com.baidu.palo.catalog; import java.util.List; +import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; +import org.junit.runner.RunWith; +import org.powermock.modules.junit4.PowerMockRunner; import com.google.common.collect.Lists; +@RunWith(PowerMockRunner.class) public class DomainResolverServerTest { private DomainResolverServer server; private String user = "test"; @@ -140,4 +144,12 @@ public class DomainResolverServerTest { // no exist domain Assert.assertFalse(server.isAvaliableDomain("www.sina.com.cn11sdfqweg")); } + + @After + public void tearDown() throws Exception { + server = null; + user = null; + 
domainNameList.clear(); + } + } diff --git a/fe/test/com/baidu/palo/common/CommandLineOptionsTest.java b/fe/test/com/baidu/palo/common/CommandLineOptionsTest.java new file mode 100644 index 0000000000..f54c998149 --- /dev/null +++ b/fe/test/com/baidu/palo/common/CommandLineOptionsTest.java @@ -0,0 +1,21 @@ +package com.baidu.palo.common; + +import com.baidu.palo.journal.bdbje.BDBToolOptions; + +import org.junit.Assert; +import org.junit.Test; + +public class CommandLineOptionsTest { + + @Test + public void test() { + CommandLineOptions options = new CommandLineOptions(true, "", null); + Assert.assertTrue(options.isVersion()); + Assert.assertFalse(options.runBdbTools()); + + options = new CommandLineOptions(false, "", new BDBToolOptions(true, "", false, "", "", 0)); + Assert.assertFalse(options.isVersion()); + Assert.assertTrue(options.runBdbTools()); + } + +} diff --git a/gensrc/parser/sql_parser.y b/gensrc/parser/sql_parser.y index 75f12c4467..9af0e57794 100644 --- a/gensrc/parser/sql_parser.y +++ b/gensrc/parser/sql_parser.y @@ -643,7 +643,7 @@ alter_table_clause ::= {: RESULT = new ReorderColumnsClause(cols, rollup, properties); :} - | opt_properties:properties + | KW_SET LPAREN key_value_map:properties RPAREN {: RESULT = new ModifyTablePropertiesClause(properties); :} diff --git a/gensrc/proto/data.proto b/gensrc/proto/data.proto new file mode 100644 index 0000000000..363200399a --- /dev/null +++ b/gensrc/proto/data.proto @@ -0,0 +1,27 @@ +// Copyright (c) 2018, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +syntax="proto2"; + +package palo; + +message PRowBatch { + required int32 num_rows = 1; + repeated int32 row_tuples = 2; + repeated int32 tuple_offsets = 3; + required bytes tuple_data = 4; + required bool is_compressed = 5; +}; + diff --git a/gensrc/proto/internal_service.proto b/gensrc/proto/internal_service.proto new file mode 100644 index 0000000000..68e481743c --- /dev/null +++ b/gensrc/proto/internal_service.proto @@ -0,0 +1,49 @@ +// Copyright (c) 2018, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +syntax="proto2"; + +package palo; + +import "data.proto"; +import "status.proto"; +import "types.proto"; + +option cc_generic_services = true; + +message PTransmitDataParams { + // non-change member + required PUniqueId finst_id = 1; + required int32 node_id = 2; + required int32 sender_id = 3; + required int32 be_number = 4; + + // different per packet + required bool eos = 5; + optional PRowBatch row_batch = 6; + // if set to true, indicates that no more row batches will be sent + // for this dest_node_id + // Id of this fragment in its role as a sender. 
+ required int64 packet_seq = 7; +}; + +message PTransmitDataResult { + optional PStatus status = 1; +}; + +service PInternalService { + rpc transmit_data(PTransmitDataParams) returns (PTransmitDataResult); +}; + diff --git a/gensrc/proto/status.proto b/gensrc/proto/status.proto new file mode 100644 index 0000000000..79f0f60d0e --- /dev/null +++ b/gensrc/proto/status.proto @@ -0,0 +1,24 @@ +// Copyright (c) 2018, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +syntax="proto2"; + +package palo; + +message PStatus { + required int32 status_code = 1; + repeated string error_msgs = 2; +}; + diff --git a/gensrc/proto/types.proto b/gensrc/proto/types.proto new file mode 100644 index 0000000000..10d2f20e46 --- /dev/null +++ b/gensrc/proto/types.proto @@ -0,0 +1,24 @@ +// Copyright (c) 2018, Baidu.com, Inc. All Rights Reserved + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +syntax="proto2"; + +package palo; + +message PUniqueId { + required int64 hi = 1; + required int64 lo = 2; +}; + diff --git a/gensrc/script/gen_build_version.sh b/gensrc/script/gen_build_version.sh index 9021735813..db4dcada1c 100755 --- a/gensrc/script/gen_build_version.sh +++ b/gensrc/script/gen_build_version.sh @@ -26,7 +26,7 @@ # contains the build version based on the git hash or svn revision. ############################################################## -build_version="0.8.0" +build_version="PALO3.3.19-RELEASE" unset LANG unset LC_CTYPE @@ -45,7 +45,7 @@ then echo "PALO_HOME: ${PALO_HOME}" fi -if [[ ${RUN_BE_TEST} == 1 ]]; then +if [[ -z ${PALO_TEST_BINARY_DIR} ]]; then if [ -e ${PALO_HOME}/gensrc/build/java/com/baidu/palo/common/Version.java \ -a -e ${PALO_HOME}/gensrc/build/gen_cpp/version.h ]; then exit diff --git a/gensrc/thrift/HeartbeatService.thrift b/gensrc/thrift/HeartbeatService.thrift index 683154d1fb..b8bb812b1d 100644 --- a/gensrc/thrift/HeartbeatService.thrift +++ b/gensrc/thrift/HeartbeatService.thrift @@ -35,6 +35,7 @@ struct TBackendInfo { 1: required Types.TPort be_port 2: required Types.TPort http_port 3: optional Types.TPort be_rpc_port + 4: optional Types.TPort brpc_port } struct THeartbeatResult { diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift index 9afe2cb790..9db4c31006 100644 --- a/gensrc/thrift/PaloInternalService.thrift +++ b/gensrc/thrift/PaloInternalService.thrift @@ -52,6 +52,11 @@ enum TErrorHubType { NULL_TYPE } +enum TPrefetchMode { + NONE, + HT_BUCKET +} + struct TMysqlErrorHubInfo { 1: required string host; 2: required i32 port; @@ -88,6 +93,34 @@ struct TQueryOptions { // INT64::MAX 17: optional i64 kudu_latest_observed_ts = 9223372036854775807 18: optional TQueryType query_type = TQueryType.SELECT + 19: optional i64 min_reservation = 0 + 20: optional i64 max_reservation = 107374182400 + 21: optional i64 initial_reservation_total_claims = 2147483647 // TODO chenhao + 22: 
optional i64 buffer_pool_limit = 2147483648 + + // The default spillable buffer size in bytes, which may be overridden by the planner. + // Defaults to 2MB. + 23: optional i64 default_spillable_buffer_size = 2097152; + + // The minimum spillable buffer to use. The planner will not choose a size smaller than + // this. Defaults to 64KB. + 24: optional i64 min_spillable_buffer_size = 65536; + + // The maximum size of row that the query will reserve memory to process. Processing + // rows larger than this may result in a query failure. Defaults to 512KB, e.g. + // enough for a row with 15 32KB strings or many smaller columns. + // + // Different operators handle this option in different ways. E.g. some simply increase + // the size of all their buffers to fit this row size, whereas others may use more + // sophisticated strategies - e.g. reserving a small number of buffers large enough to + // fit maximum-sized rows. + 25: optional i64 max_row_size = 524288; + + // stream preaggregation + 26: optional bool disable_stream_preaggregations = false; + + // multithreaded degree of intra-node parallelism + 27: optional i32 mt_dop = 0; } // A scan range plus the parameters needed to execute that scan. @@ -103,6 +136,7 @@ struct TPlanFragmentDestination { // ... which is being executed on this server 2: required Types.TNetworkAddress server + 3: optional Types.TNetworkAddress brpc_server } // Parameters for a single execution instance of a particular TPlanFragment diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift index d136192243..e67679977e 100644 --- a/gensrc/thrift/PlanNodes.thrift +++ b/gensrc/thrift/PlanNodes.thrift @@ -18,301 +18,301 @@ // specific language governing permissions and limitations // under the License. 
-namespace cpp palo -namespace java com.baidu.palo.thrift - -include "Exprs.thrift" -include "Types.thrift" -include "Partitions.thrift" - -enum TPlanNodeType { - OLAP_SCAN_NODE, - MYSQL_SCAN_NODE, - CSV_SCAN_NODE, - SCHEMA_SCAN_NODE, - HASH_JOIN_NODE, - MERGE_JOIN_NODE, - AGGREGATION_NODE, - PRE_AGGREGATION_NODE, - SORT_NODE, - EXCHANGE_NODE, - MERGE_NODE, - SELECT_NODE, - CROSS_JOIN_NODE, - META_SCAN_NODE, - ANALYTIC_EVAL_NODE, - OLAP_REWRITE_NODE, - KUDU_SCAN_NODE - BROKER_SCAN_NODE - EMPTY_SET_NODE - UNION_NODE -} - -// phases of an execution node -enum TExecNodePhase { - PREPARE, - OPEN, - GETNEXT, - CLOSE, - INVALID -} - -// what to do when hitting a debug point (TPaloQueryOptions.DEBUG_ACTION) -enum TDebugAction { - WAIT, - FAIL -} - -struct TKeyRange { - 1: required i64 begin_key - 2: required i64 end_key - 3: required Types.TPrimitiveType column_type - 4: required string column_name -} - -// The information contained in subclasses of ScanNode captured in two separate -// Thrift structs: -// - TScanRange: the data range that's covered by the scan (which varies with the -// particular partition of the plan fragment of which the scan node is a part) -// - T: all other operational parameters that are the same across -// all plan fragments - -struct TPaloScanRange { - 1: required list hosts - 2: required string schema_hash - 3: required string version - 4: required string version_hash - 5: required Types.TTabletId tablet_id - 6: required string db_name - 7: optional list partition_column_ranges - 8: optional string index_name - 9: optional string table_name -} - -enum TFileFormatType { - FORMAT_CSV_PLAIN, - FORMAT_CSV_GZ, - FORMAT_CSV_LZO, - FORMAT_CSV_BZ2, - FORMAT_CSV_LZ4FRAME, - FORMAT_CSV_LZOP -} - -// One broker range information. 
-struct TBrokerRangeDesc { - 1: required Types.TFileType file_type - 2: required TFileFormatType format_type - 3: required bool splittable; - // Path of this range - 4: required string path - // Offset of this file start - 5: required i64 start_offset; - // Size of this range, if size = -1, this means that will read to then end of file - 6: required i64 size -} - -struct TBrokerScanRangeParams { - 1: required byte column_separator; - 2: required byte line_delimiter; - - // We construct one line in file to a tuple. And each field of line - // correspond to a slot in this tuple. - // src_tuple_id is the tuple id of the input file - 3: required Types.TTupleId src_tuple_id - // src_slot_ids is the slot_ids of the input file - // we use this id to find the slot descriptor - 4: required list src_slot_ids - - // dest_tuple_id is the tuple id that need by scan node - 5: required Types.TTupleId dest_tuple_id - // This is expr that convert the content read from file - // the format that need by the compute layer. - 6: optional map expr_of_dest_slot - - // properties need to access broker. - 7: optional map properties; - - // If partition_ids is set, data that doesn't in this partition will be filtered. 
- 8: optional list partition_ids -} - -// Broker scan range -struct TBrokerScanRange { - 1: required list ranges - 2: required TBrokerScanRangeParams params - 3: required list broker_addresses -} - -// Specification of an individual data range which is held in its entirety -// by a storage server -struct TScanRange { - // one of these must be set for every TScanRange2 - 4: optional TPaloScanRange palo_scan_range - 5: optional binary kudu_scan_token - 6: optional TBrokerScanRange broker_scan_range -} - -struct TMySQLScanNode { - 1: required Types.TTupleId tuple_id - 2: required string table_name - 3: required list columns - 4: required list filters -} - -struct TBrokerScanNode { - 1: required Types.TTupleId tuple_id - - // Partition info used to process partition select in broker load - 2: optional list partition_exprs - 3: optional list partition_infos +namespace cpp palo +namespace java com.baidu.palo.thrift + +include "Exprs.thrift" +include "Types.thrift" +include "Partitions.thrift" + +enum TPlanNodeType { + OLAP_SCAN_NODE, + MYSQL_SCAN_NODE, + CSV_SCAN_NODE, + SCHEMA_SCAN_NODE, + HASH_JOIN_NODE, + MERGE_JOIN_NODE, + AGGREGATION_NODE, + PRE_AGGREGATION_NODE, + SORT_NODE, + EXCHANGE_NODE, + MERGE_NODE, + SELECT_NODE, + CROSS_JOIN_NODE, + META_SCAN_NODE, + ANALYTIC_EVAL_NODE, + OLAP_REWRITE_NODE, + KUDU_SCAN_NODE + BROKER_SCAN_NODE + EMPTY_SET_NODE + UNION_NODE } -struct TMiniLoadEtlFunction { - 1: required string function_name - 2: required i32 param_column_index -} - -struct TCsvScanNode { - 1: required Types.TTupleId tuple_id - 2: required list file_paths - - 3: optional string column_separator - 4: optional string line_delimiter - - // - 5: optional map column_type_mapping - - // columns specified in load command - 6: optional list columns - // - 7: optional list unspecified_columns - // always string type, and only contain columns which are not specified - 8: optional list default_values - - 9: optional double max_filter_ratio - 10:optional map 
column_function_mapping -} - -struct TSchemaScanNode { - 1: required Types.TTupleId tuple_id - - 2: required string table_name - 3: optional string db - 4: optional string table - 5: optional string wild - 6: optional string user - 7: optional string ip - 8: optional i32 port - 9: optional i64 thread_id -} - -struct TMetaScanNode { - 1: required Types.TTupleId tuple_id - - 2: required string table_name - 3: optional string db - 4: optional string table - 5: optional string user -} - -struct TOlapScanNode { - 1: required Types.TTupleId tuple_id - 2: required list key_column_name - 3: required list key_column_type - 4: required bool is_preaggregation - 5: optional string sort_column -} -struct TEqJoinCondition { - // left-hand side of " = " - 1: required Exprs.TExpr left; - // right-hand side of " = " - 2: required Exprs.TExpr right; -} - -enum TJoinOp { - INNER_JOIN, - LEFT_OUTER_JOIN, - LEFT_SEMI_JOIN, - RIGHT_OUTER_JOIN, - FULL_OUTER_JOIN, - CROSS_JOIN, - MERGE_JOIN, - - RIGHT_SEMI_JOIN, - LEFT_ANTI_JOIN, - RIGHT_ANTI_JOIN, - - // Similar to LEFT_ANTI_JOIN with special handling for NULLs for the join conjuncts - // on the build side. Those NULLs are considered candidate matches, and therefore could - // be rejected (ANTI-join), based on the other join conjuncts. This is in contrast - // to LEFT_ANTI_JOIN where NULLs are not matches and therefore always returned. - NULL_AWARE_LEFT_ANTI_JOIN -} - -struct THashJoinNode { - 1: required TJoinOp join_op - - // anything from the ON, USING or WHERE clauses that's an equi-join predicate - 2: required list eq_join_conjuncts - - // anything from the ON or USING clauses (but *not* the WHERE clause) that's not an - // equi-join predicate - 3: optional list other_join_conjuncts - 4: optional bool is_push_down - - // If true, this join node can (but may choose not to) generate slot filters - // after constructing the build side that can be applied to the probe side. 
- 5: optional bool add_probe_filters -} - -struct TMergeJoinNode { - // anything from the ON, USING or WHERE clauses that's an equi-join predicate - 1: required list cmp_conjuncts - - // anything from the ON or USING clauses (but *not* the WHERE clause) that's not an - // equi-join predicate - 2: optional list other_join_conjuncts -} - -enum TAggregationOp { - INVALID, - COUNT, - MAX, - DISTINCT_PC, - DISTINCT_PCSA, - MIN, - SUM, - GROUP_CONCAT, - HLL, - COUNT_DISTINCT, - SUM_DISTINCT, - LEAD, - FIRST_VALUE, - LAST_VALUE, - RANK, - DENSE_RANK, - ROW_NUMBER, - LAG, - HLL_C, -} - -//struct TAggregateFunctionCall { - // The aggregate function to call. -// 1: required Types.TFunction fn - - // The input exprs to this aggregate function -// 2: required list input_exprs - - // If set, this aggregate function udf has varargs and this is the index for the - // first variable argument. -// 3: optional i32 vararg_start_idx -//} - -struct TAggregationNode { - 1: optional list grouping_exprs +// phases of an execution node +enum TExecNodePhase { + PREPARE, + OPEN, + GETNEXT, + CLOSE, + INVALID +} + +// what to do when hitting a debug point (TPaloQueryOptions.DEBUG_ACTION) +enum TDebugAction { + WAIT, + FAIL +} + +struct TKeyRange { + 1: required i64 begin_key + 2: required i64 end_key + 3: required Types.TPrimitiveType column_type + 4: required string column_name +} + +// The information contained in subclasses of ScanNode captured in two separate +// Thrift structs: +// - TScanRange: the data range that's covered by the scan (which varies with the +// particular partition of the plan fragment of which the scan node is a part) +// - T: all other operational parameters that are the same across +// all plan fragments + +struct TPaloScanRange { + 1: required list hosts + 2: required string schema_hash + 3: required string version + 4: required string version_hash + 5: required Types.TTabletId tablet_id + 6: required string db_name + 7: optional list partition_column_ranges + 8: 
optional string index_name + 9: optional string table_name +} + +enum TFileFormatType { + FORMAT_CSV_PLAIN, + FORMAT_CSV_GZ, + FORMAT_CSV_LZO, + FORMAT_CSV_BZ2, + FORMAT_CSV_LZ4FRAME, + FORMAT_CSV_LZOP +} + +// One broker range information. +struct TBrokerRangeDesc { + 1: required Types.TFileType file_type + 2: required TFileFormatType format_type + 3: required bool splittable; + // Path of this range + 4: required string path + // Offset of this file start + 5: required i64 start_offset; + // Size of this range, if size = -1, this means that will read to then end of file + 6: required i64 size +} + +struct TBrokerScanRangeParams { + 1: required byte column_separator; + 2: required byte line_delimiter; + + // We construct one line in file to a tuple. And each field of line + // correspond to a slot in this tuple. + // src_tuple_id is the tuple id of the input file + 3: required Types.TTupleId src_tuple_id + // src_slot_ids is the slot_ids of the input file + // we use this id to find the slot descriptor + 4: required list src_slot_ids + + // dest_tuple_id is the tuple id that need by scan node + 5: required Types.TTupleId dest_tuple_id + // This is expr that convert the content read from file + // the format that need by the compute layer. + 6: optional map expr_of_dest_slot + + // properties need to access broker. + 7: optional map properties; + + // If partition_ids is set, data that doesn't in this partition will be filtered. 
+ 8: optional list partition_ids +} + +// Broker scan range +struct TBrokerScanRange { + 1: required list ranges + 2: required TBrokerScanRangeParams params + 3: required list broker_addresses +} + +// Specification of an individual data range which is held in its entirety +// by a storage server +struct TScanRange { + // one of these must be set for every TScanRange2 + 4: optional TPaloScanRange palo_scan_range + 5: optional binary kudu_scan_token + 6: optional TBrokerScanRange broker_scan_range +} + +struct TMySQLScanNode { + 1: required Types.TTupleId tuple_id + 2: required string table_name + 3: required list columns + 4: required list filters +} + +struct TBrokerScanNode { + 1: required Types.TTupleId tuple_id + + // Partition info used to process partition select in broker load + 2: optional list partition_exprs + 3: optional list partition_infos +} + +struct TMiniLoadEtlFunction { + 1: required string function_name + 2: required i32 param_column_index +} + +struct TCsvScanNode { + 1: required Types.TTupleId tuple_id + 2: required list file_paths + + 3: optional string column_separator + 4: optional string line_delimiter + + // + 5: optional map column_type_mapping + + // columns specified in load command + 6: optional list columns + // + 7: optional list unspecified_columns + // always string type, and only contain columns which are not specified + 8: optional list default_values + + 9: optional double max_filter_ratio + 10:optional map column_function_mapping +} + +struct TSchemaScanNode { + 1: required Types.TTupleId tuple_id + + 2: required string table_name + 3: optional string db + 4: optional string table + 5: optional string wild + 6: optional string user + 7: optional string ip + 8: optional i32 port + 9: optional i64 thread_id +} + +struct TMetaScanNode { + 1: required Types.TTupleId tuple_id + + 2: required string table_name + 3: optional string db + 4: optional string table + 5: optional string user +} + +struct TOlapScanNode { + 1: required 
Types.TTupleId tuple_id + 2: required list key_column_name + 3: required list key_column_type + 4: required bool is_preaggregation + 5: optional string sort_column +} +struct TEqJoinCondition { + // left-hand side of " = " + 1: required Exprs.TExpr left; + // right-hand side of " = " + 2: required Exprs.TExpr right; +} + +enum TJoinOp { + INNER_JOIN, + LEFT_OUTER_JOIN, + LEFT_SEMI_JOIN, + RIGHT_OUTER_JOIN, + FULL_OUTER_JOIN, + CROSS_JOIN, + MERGE_JOIN, + + RIGHT_SEMI_JOIN, + LEFT_ANTI_JOIN, + RIGHT_ANTI_JOIN, + + // Similar to LEFT_ANTI_JOIN with special handling for NULLs for the join conjuncts + // on the build side. Those NULLs are considered candidate matches, and therefore could + // be rejected (ANTI-join), based on the other join conjuncts. This is in contrast + // to LEFT_ANTI_JOIN where NULLs are not matches and therefore always returned. + NULL_AWARE_LEFT_ANTI_JOIN +} + +struct THashJoinNode { + 1: required TJoinOp join_op + + // anything from the ON, USING or WHERE clauses that's an equi-join predicate + 2: required list eq_join_conjuncts + + // anything from the ON or USING clauses (but *not* the WHERE clause) that's not an + // equi-join predicate + 3: optional list other_join_conjuncts + 4: optional bool is_push_down + + // If true, this join node can (but may choose not to) generate slot filters + // after constructing the build side that can be applied to the probe side. 
+ 5: optional bool add_probe_filters +} + +struct TMergeJoinNode { + // anything from the ON, USING or WHERE clauses that's an equi-join predicate + 1: required list cmp_conjuncts + + // anything from the ON or USING clauses (but *not* the WHERE clause) that's not an + // equi-join predicate + 2: optional list other_join_conjuncts +} + +enum TAggregationOp { + INVALID, + COUNT, + MAX, + DISTINCT_PC, + DISTINCT_PCSA, + MIN, + SUM, + GROUP_CONCAT, + HLL, + COUNT_DISTINCT, + SUM_DISTINCT, + LEAD, + FIRST_VALUE, + LAST_VALUE, + RANK, + DENSE_RANK, + ROW_NUMBER, + LAG, + HLL_C, +} + +//struct TAggregateFunctionCall { + // The aggregate function to call. +// 1: required Types.TFunction fn + + // The input exprs to this aggregate function +// 2: required list input_exprs + + // If set, this aggregate function udf has varargs and this is the index for the + // first variable argument. +// 3: optional i32 vararg_start_idx +//} + +struct TAggregationNode { + 1: optional list grouping_exprs // aggregate exprs. The root of each expr is the aggregate function. The // other exprs are the inputs to the aggregate function. - 2: required list aggregate_functions + 2: required list aggregate_functions // Tuple id used for intermediate aggregations (with slots of agg intermediate types) 3: required Types.TTupleId intermediate_tuple_id @@ -322,216 +322,237 @@ struct TAggregationNode { // aggregate functions. 4: required Types.TTupleId output_tuple_id - // Set to true if this aggregation function requires finalization to complete after all - // rows have been aggregated, and this node is not an intermediate node. - 5: required bool need_finalize -} - -struct TPreAggregationNode { - 1: required list group_exprs - 2: required list aggregate_exprs -} - -struct TSortInfo { - 1: required list ordering_exprs - 2: required list is_asc_order - // Indicates, for each expr, if nulls should be listed first or last. This is - // independent of is_asc_order. 
- 3: required list nulls_first - // Expressions evaluated over the input row that materialize the tuple to be sorted. - // Contains one expr per slot in the materialized tuple. - 4: optional list sort_tuple_slot_exprs -} - -struct TSortNode { - 1: required TSortInfo sort_info - // Indicates whether the backend service should use topn vs. sorting - 2: required bool use_top_n; - // This is the number of rows to skip before returning results - 3: optional i64 offset - - // TODO(lingbin): remove blew, because duplaicate with TSortInfo - 4: optional list ordering_exprs - 5: optional list is_asc_order - // Indicates whether the imposed limit comes DEFAULT_ORDER_BY_LIMIT. - 6: optional bool is_default_limit - // Indicates, for each expr, if nulls should be listed first or last. This is - // independent of is_asc_order. - 7: optional list nulls_first - // Expressions evaluated over the input row that materialize the tuple to be so - // Contains one expr per slot in the materialized tuple. - 8: optional list sort_tuple_slot_exprs -} - -enum TAnalyticWindowType { - // Specifies the window as a logical offset - RANGE, - - // Specifies the window in physical units - ROWS -} - -enum TAnalyticWindowBoundaryType { - // The window starts/ends at the current row. - CURRENT_ROW, - - // The window starts/ends at an offset preceding current row. - PRECEDING, - - // The window starts/ends at an offset following current row. - FOLLOWING -} - -struct TAnalyticWindowBoundary { - 1: required TAnalyticWindowBoundaryType type - - // Predicate that checks: child tuple '<=' buffered tuple + offset for the orderby expr - 2: optional Exprs.TExpr range_offset_predicate - - // Offset from the current row for ROWS windows. - 3: optional i64 rows_offset_value -} - -struct TAnalyticWindow { - // Specifies the window type for the start and end bounds. - 1: required TAnalyticWindowType type - - // Absence indicates window start is UNBOUNDED PRECEDING. 
- 2: optional TAnalyticWindowBoundary window_start - - // Absence indicates window end is UNBOUNDED FOLLOWING. - 3: optional TAnalyticWindowBoundary window_end -} - -// Defines a group of one or more analytic functions that share the same window, -// partitioning expressions and order-by expressions and are evaluated by a single -// ExecNode. -struct TAnalyticNode { - // Exprs on which the analytic function input is partitioned. Input is already sorted - // on partitions and order by clauses, partition_exprs is used to identify partition - // boundaries. Empty if no partition clause is specified. - 1: required list partition_exprs - - // Exprs specified by an order-by clause for RANGE windows. Used to evaluate RANGE - // window boundaries. Empty if no order-by clause is specified or for windows - // specifying ROWS. - 2: required list order_by_exprs - - // Functions evaluated over the window for each input row. The root of each expr is - // the aggregate function. Child exprs are the inputs to the function. - 3: required list analytic_functions - - // Window specification - 4: optional TAnalyticWindow window - - // Tuple used for intermediate results of analytic function evaluations - // (with slots of analytic intermediate types) - 5: required Types.TTupleId intermediate_tuple_id - - // Tupld used for the analytic function output (with slots of analytic output types) - // Equal to intermediate_tuple_id if intermediate type == output type for all - // analytic functions. - 6: required Types.TTupleId output_tuple_id - - // id of the buffered tuple (identical to the input tuple, which is assumed - // to come from a single SortNode); not set if both partition_exprs and - // order_by_exprs are empty - 7: optional Types.TTupleId buffered_tuple_id - - // predicate that checks: child tuple is in the same partition as the buffered tuple, - // i.e. each partition expr is equal or both are not null. 
Only set if - // buffered_tuple_id is set; should be evaluated over a row that is composed of the - // child tuple and the buffered tuple - 8: optional Exprs.TExpr partition_by_eq - - // predicate that checks: the order_by_exprs are equal or both NULL when evaluated - // over the child tuple and the buffered tuple. only set if buffered_tuple_id is set; - // should be evaluated over a row that is composed of the child tuple and the buffered - // tuple - 9: optional Exprs.TExpr order_by_eq -} - -struct TMergeNode { - // A MergeNode could be the left input of a join and needs to know which tuple to write. - 1: required Types.TTupleId tuple_id - // List or expr lists materialized by this node. - // There is one list of exprs per query stmt feeding into this merge node. - 2: required list> result_expr_lists - // Separate list of expr lists coming from a constant select stmts. - 3: required list> const_expr_lists -} - -struct TUnionNode { - // A UnionNode materializes all const/result exprs into this tuple. - 1: required Types.TTupleId tuple_id - // List or expr lists materialized by this node. - // There is one list of exprs per query stmt feeding into this union node. - 2: required list> result_expr_lists - // Separate list of expr lists coming from a constant select stmts. - 3: required list> const_expr_lists - // Index of the first child that needs to be materialized. - 4: required i64 first_materialized_child_idx -} - -struct TExchangeNode { - // The ExchangeNode's input rows form a prefix of the output rows it produces; - // this describes the composition of that prefix - 1: required list input_row_tuples - // For a merging exchange, the sort information. 
- 2: optional TSortInfo sort_info - // This is tHe number of rows to skip before returning results - 3: optional i64 offset -} - -struct TOlapRewriteNode { - 1: required list columns - 2: required list column_types - 3: required Types.TTupleId output_tuple_id -} - -struct TKuduScanNode { - 1: required Types.TTupleId tuple_id -} - -// This is essentially a union of all messages corresponding to subclasses -// of PlanNode. -struct TPlanNode { - // node id, needed to reassemble tree structure - 1: required Types.TPlanNodeId node_id - 2: required TPlanNodeType node_type - 3: required i32 num_children - 4: required i64 limit - 5: required list row_tuples - - // nullable_tuples[i] is true if row_tuples[i] is nullable - 6: required list nullable_tuples - 7: optional list conjuncts - - // Produce data in compact format. - 8: required bool compact_data - - // one field per PlanNode subclass - 11: optional THashJoinNode hash_join_node - 12: optional TAggregationNode agg_node - 13: optional TSortNode sort_node - 14: optional TMergeNode merge_node - 15: optional TExchangeNode exchange_node - 17: optional TMySQLScanNode mysql_scan_node - 18: optional TOlapScanNode olap_scan_node - 19: optional TCsvScanNode csv_scan_node - 20: optional TBrokerScanNode broker_scan_node - 21: optional TPreAggregationNode pre_agg_node - 22: optional TSchemaScanNode schema_scan_node - 23: optional TMergeJoinNode merge_join_node - 24: optional TMetaScanNode meta_scan_node - 25: optional TAnalyticNode analytic_node - 26: optional TOlapRewriteNode olap_rewrite_node - 27: optional TKuduScanNode kudu_scan_node - 28: optional TUnionNode union_node -} - -// A flattened representation of a tree of PlanNodes, obtained by depth-first -// traversal. -struct TPlan { - 1: required list nodes -} + // Set to true if this aggregation function requires finalization to complete after all + // rows have been aggregated, and this node is not an intermediate node. 
+  5: required bool need_finalize
+  6: optional bool use_streaming_preaggregation
+}
+
+struct TPreAggregationNode {
+  1: required list<Exprs.TExpr> group_exprs
+  2: required list<Exprs.TExpr> aggregate_exprs
+}
+
+struct TSortInfo {
+  1: required list<Exprs.TExpr> ordering_exprs
+  2: required list<bool> is_asc_order
+  // Indicates, for each expr, if nulls should be listed first or last. This is
+  // independent of is_asc_order.
+  3: required list<bool> nulls_first
+  // Expressions evaluated over the input row that materialize the tuple to be sorted.
+  // Contains one expr per slot in the materialized tuple.
+  4: optional list<Exprs.TExpr> sort_tuple_slot_exprs
+}
+
+struct TSortNode {
+  1: required TSortInfo sort_info
+  // Indicates whether the backend service should use topn vs. sorting
+  2: required bool use_top_n;
+  // This is the number of rows to skip before returning results
+  3: optional i64 offset
+
+  // TODO(lingbin): remove below, because duplicate with TSortInfo
+  4: optional list<Exprs.TExpr> ordering_exprs
+  5: optional list<bool> is_asc_order
+  // Indicates whether the imposed limit comes from DEFAULT_ORDER_BY_LIMIT.
+  6: optional bool is_default_limit
+  // Indicates, for each expr, if nulls should be listed first or last. This is
+  // independent of is_asc_order.
+  7: optional list<bool> nulls_first
+  // Expressions evaluated over the input row that materialize the tuple to be sorted.
+  // Contains one expr per slot in the materialized tuple.
+  8: optional list<Exprs.TExpr> sort_tuple_slot_exprs
+}
+
+enum TAnalyticWindowType {
+  // Specifies the window as a logical offset
+  RANGE,
+
+  // Specifies the window in physical units
+  ROWS
+}
+
+enum TAnalyticWindowBoundaryType {
+  // The window starts/ends at the current row.
+  CURRENT_ROW,
+
+  // The window starts/ends at an offset preceding current row.
+  PRECEDING,
+
+  // The window starts/ends at an offset following current row.
+ FOLLOWING +} + +struct TAnalyticWindowBoundary { + 1: required TAnalyticWindowBoundaryType type + + // Predicate that checks: child tuple '<=' buffered tuple + offset for the orderby expr + 2: optional Exprs.TExpr range_offset_predicate + + // Offset from the current row for ROWS windows. + 3: optional i64 rows_offset_value +} + +struct TAnalyticWindow { + // Specifies the window type for the start and end bounds. + 1: required TAnalyticWindowType type + + // Absence indicates window start is UNBOUNDED PRECEDING. + 2: optional TAnalyticWindowBoundary window_start + + // Absence indicates window end is UNBOUNDED FOLLOWING. + 3: optional TAnalyticWindowBoundary window_end +} + +// Defines a group of one or more analytic functions that share the same window, +// partitioning expressions and order-by expressions and are evaluated by a single +// ExecNode. +struct TAnalyticNode { + // Exprs on which the analytic function input is partitioned. Input is already sorted + // on partitions and order by clauses, partition_exprs is used to identify partition + // boundaries. Empty if no partition clause is specified. + 1: required list partition_exprs + + // Exprs specified by an order-by clause for RANGE windows. Used to evaluate RANGE + // window boundaries. Empty if no order-by clause is specified or for windows + // specifying ROWS. + 2: required list order_by_exprs + + // Functions evaluated over the window for each input row. The root of each expr is + // the aggregate function. Child exprs are the inputs to the function. 
+  3: required list<Exprs.TExpr> analytic_functions
+
+  // Window specification
+  4: optional TAnalyticWindow window
+
+  // Tuple used for intermediate results of analytic function evaluations
+  // (with slots of analytic intermediate types)
+  5: required Types.TTupleId intermediate_tuple_id
+
+  // Tuple used for the analytic function output (with slots of analytic output types)
+  // Equal to intermediate_tuple_id if intermediate type == output type for all
+  // analytic functions.
+  6: required Types.TTupleId output_tuple_id
+
+  // id of the buffered tuple (identical to the input tuple, which is assumed
+  // to come from a single SortNode); not set if both partition_exprs and
+  // order_by_exprs are empty
+  7: optional Types.TTupleId buffered_tuple_id
+
+  // predicate that checks: child tuple is in the same partition as the buffered tuple,
+  // i.e. each partition expr is equal or both are not null. Only set if
+  // buffered_tuple_id is set; should be evaluated over a row that is composed of the
+  // child tuple and the buffered tuple
+  8: optional Exprs.TExpr partition_by_eq
+
+  // predicate that checks: the order_by_exprs are equal or both NULL when evaluated
+  // over the child tuple and the buffered tuple. only set if buffered_tuple_id is set;
+  // should be evaluated over a row that is composed of the child tuple and the buffered
+  // tuple
+  9: optional Exprs.TExpr order_by_eq
+}
+
+struct TMergeNode {
+  // A MergeNode could be the left input of a join and needs to know which tuple to write.
+  1: required Types.TTupleId tuple_id
+  // List or expr lists materialized by this node.
+  // There is one list of exprs per query stmt feeding into this merge node.
+  2: required list<list<Exprs.TExpr>> result_expr_lists
+  // Separate list of expr lists coming from a constant select stmts.
+  3: required list<list<Exprs.TExpr>> const_expr_lists
+}
+
+struct TUnionNode {
+  // A UnionNode materializes all const/result exprs into this tuple.
+  1: required Types.TTupleId tuple_id
+  // List or expr lists materialized by this node.
+  // There is one list of exprs per query stmt feeding into this union node.
+  2: required list<list<Exprs.TExpr>> result_expr_lists
+  // Separate list of expr lists coming from a constant select stmts.
+  3: required list<list<Exprs.TExpr>> const_expr_lists
+  // Index of the first child that needs to be materialized.
+  4: required i64 first_materialized_child_idx
+}
+
+struct TExchangeNode {
+  // The ExchangeNode's input rows form a prefix of the output rows it produces;
+  // this describes the composition of that prefix
+  1: required list<Types.TTupleId> input_row_tuples
+  // For a merging exchange, the sort information.
+  2: optional TSortInfo sort_info
+  // This is the number of rows to skip before returning results
+  3: optional i64 offset
+}
+
+struct TOlapRewriteNode {
+  1: required list<Exprs.TExpr> columns
+  2: required list<Types.TColumnType> column_types
+  3: required Types.TTupleId output_tuple_id
+}
+
+struct TKuduScanNode {
+  1: required Types.TTupleId tuple_id
+}
+
+// This contains all of the information computed by the plan as part of the resource
+// profile that is needed by the backend to execute.
+struct TBackendResourceProfile {
+  // The minimum reservation for this plan node in bytes.
+  1: required i64 min_reservation = 0; // no support reservation
+
+  // The maximum reservation for this plan node in bytes. MAX_INT64 means effectively
+  // unlimited.
+  2: required i64 max_reservation = 12188490189880; // no max reservation limit
+
+  // The spillable buffer size in bytes to use for this node, chosen by the planner.
+  // Set iff the node uses spillable buffers.
+  3: optional i64 spillable_buffer_size = 2097152
+
+  // The buffer size in bytes that is large enough to fit the largest row to be processed.
+  // Set if the node allocates buffers for rows from the buffer pool.
+  4: optional i64 max_row_buffer_size = 4194304 //TODO chenhao
+}
+
+// This is essentially a union of all messages corresponding to subclasses
+// of PlanNode.
+struct TPlanNode { + // node id, needed to reassemble tree structure + 1: required Types.TPlanNodeId node_id + 2: required TPlanNodeType node_type + 3: required i32 num_children + 4: required i64 limit + 5: required list row_tuples + + // nullable_tuples[i] is true if row_tuples[i] is nullable + 6: required list nullable_tuples + 7: optional list conjuncts + + // Produce data in compact format. + 8: required bool compact_data + + // one field per PlanNode subclass + 11: optional THashJoinNode hash_join_node + 12: optional TAggregationNode agg_node + 13: optional TSortNode sort_node + 14: optional TMergeNode merge_node + 15: optional TExchangeNode exchange_node + 17: optional TMySQLScanNode mysql_scan_node + 18: optional TOlapScanNode olap_scan_node + 19: optional TCsvScanNode csv_scan_node + 20: optional TBrokerScanNode broker_scan_node + 21: optional TPreAggregationNode pre_agg_node + 22: optional TSchemaScanNode schema_scan_node + 23: optional TMergeJoinNode merge_join_node + 24: optional TMetaScanNode meta_scan_node + 25: optional TAnalyticNode analytic_node + 26: optional TOlapRewriteNode olap_rewrite_node + 27: optional TKuduScanNode kudu_scan_node + 28: optional TUnionNode union_node + 29: optional TBackendResourceProfile resource_profile +} + +// A flattened representation of a tree of PlanNodes, obtained by depth-first +// traversal. +struct TPlan { + 1: required list nodes +} diff --git a/gensrc/thrift/Planner.thrift b/gensrc/thrift/Planner.thrift index 9653dd144a..09da5bb8db 100644 --- a/gensrc/thrift/Planner.thrift +++ b/gensrc/thrift/Planner.thrift @@ -54,6 +54,16 @@ struct TPlanFragment { // This is distinct from the partitioning of each plan fragment's // output, which is specified by output_sink.output_partitioning. 6: required Partitions.TDataPartition partition + + // The minimum reservation size (in bytes) required for an instance of this plan + // fragment to execute on a single host. 
+ 7: optional i64 min_reservation_bytes + + // Total of the initial buffer reservations that we expect to be claimed by this + // fragment. I.e. the sum of the min reservations over all operators (including the + // sink) in a single instance of this fragment. This is used for an optimization in + // InitialReservation. Measured in bytes. required in V1 + 8: optional i64 initial_reservation_total_claims } // location information for a single scan range diff --git a/gensrc/thrift/Status.thrift b/gensrc/thrift/Status.thrift index 500f7b4f81..b92be6ef39 100644 --- a/gensrc/thrift/Status.thrift +++ b/gensrc/thrift/Status.thrift @@ -32,7 +32,10 @@ enum TStatusCode { THRIFT_RPC_ERROR, TIMEOUT, KUDU_NOT_ENABLED, - KUDU_NOT_SUPPORTED_ON_OS + KUDU_NOT_SUPPORTED_ON_OS, + MEM_ALLOC_FAILED, + BUFFER_ALLOCATION_FAILED, + MINIMUM_RESERVATION_UNAVAILABLE } struct TStatus { diff --git a/gensrc/thrift/Types.thrift b/gensrc/thrift/Types.thrift index cccd809db2..0bb787a97c 100644 --- a/gensrc/thrift/Types.thrift +++ b/gensrc/thrift/Types.thrift @@ -243,6 +243,7 @@ struct TAggregateFunction { 6: optional string finalize_fn_symbol 8: optional string get_value_fn_symbol 9: optional string remove_fn_symbol + 10: optional bool is_analytic_only_fn = false } // Represents a function in the Catalog. diff --git a/thirdparty/build-thirdparty.sh b/thirdparty/build-thirdparty.sh index b78698d088..5bb2f8dc8c 100755 --- a/thirdparty/build-thirdparty.sh +++ b/thirdparty/build-thirdparty.sh @@ -436,6 +436,31 @@ build_mysql() { echo "mysql client lib is installed." } +#leveldb +build_leveldb() { + check_if_source_exist $LEVELDB_SOURCE + + cd $TP_SOURCE_DIR/$LEVELDB_SOURCE + CXXFLAGS="-fPIC" make -j$PARALLEL + cp out-static/libleveldb.a ../../installed/lib/libleveldb.a + cp -r include/leveldb ../../installed/include/ +} + +# brpc +build_brpc() { + check_if_source_exist $BRPC_SOURCE + if [ ! 
-f $CMAKE_CMD ]; then
+        echo "cmake executable does not exist"
+        exit 1
+    fi
+
+    cd $TP_SOURCE_DIR/$BRPC_SOURCE
+    mkdir build -p && cd build
+    rm -rf CMakeCache.txt CMakeFiles/
+    $CMAKE_CMD -v -DCMAKE_INSTALL_PREFIX=$TP_INSTALL_DIR -DBRPC_WITH_GLOG=ON -DCMAKE_INCLUDE_PATH="$TP_INSTALL_DIR/include" -DCMAKE_LIBRARY_PATH="$TP_INSTALL_DIR/lib;$TP_INSTALL_DIR/lib64" ..
+    make -j$PARALLEL && make install
+}
+
 build_libevent
 build_openssl
 build_zlib
@@ -451,11 +476,12 @@ build_glog
 build_gtest
 build_rapidjson
 build_snappy
-# build_libunwind // deprecated
 build_gperftools
 build_curl
 build_re2
 build_mysql
 build_thrift
+build_leveldb
+build_brpc
 
 echo "Finihsed to build all thirdparties"
 
diff --git a/thirdparty/vars.sh b/thirdparty/vars.sh
index a294857362..1a5de32ab6 100644
--- a/thirdparty/vars.sh
+++ b/thirdparty/vars.sh
@@ -70,19 +70,19 @@ THRIFT_NAME=thrift-0.9.3.tar.gz
 THRIFT_SOURCE=thrift-0.9.3
 
 # llvm
-LLVM_DOWNLOAD="http://releases.llvm.org/3.3/llvm-3.3.src.tar.gz"
-LLVM_NAME=llvm-3.3.src.tar.gz
-LLVM_SOURCE=llvm-3.3.src
+LLVM_DOWNLOAD="http://releases.llvm.org/3.4.2/llvm-3.4.2.src.tar.gz"
+LLVM_NAME=llvm-3.4.2.src.tar.gz
+LLVM_SOURCE=llvm-3.4.2.src
 
 # clang
-CLANG_DOWNLOAD="http://releases.llvm.org/3.3/cfe-3.3.src.tar.gz"
-CLANG_NAME=cfe-3.3.src.tar.gz
-CLANG_SOURCE=cfe-3.3.src
+CLANG_DOWNLOAD="http://releases.llvm.org/3.4.2/cfe-3.4.2.src.tar.gz"
+CLANG_NAME=cfe-3.4.2.src.tar.gz
+CLANG_SOURCE=cfe-3.4.2.src
 
 # compiler-rt
-COMPILER_RT_DOWNLOAD="http://releases.llvm.org/3.3/compiler-rt-3.3.src.tar.gz"
-COMPILER_RT_NAME=compiler-rt-3.3.src.tar.gz
-COMPILER_RT_SOURCE=compiler-rt-3.3.src
+COMPILER_RT_DOWNLOAD="http://releases.llvm.org/3.4/compiler-rt-3.4.src.tar.gz"
+COMPILER_RT_NAME=compiler-rt-3.4.src.tar.gz
+COMPILER_RT_SOURCE=compiler-rt-3.4
 
 # protobuf
 PROTOBUF_DOWNLOAD="https://github.com/google/protobuf/releases/download/v2.6.1/protobuf-2.6.1.tar.gz"
@@ -169,5 +169,15 @@ BOOST_FOR_MYSQL_DOWNLOAD="http://sourceforge.net/projects/boost/files/boost/1.59
BOOST_FOR_MYSQL_NAME=boost_1_59_0.tar.gz BOOST_FOR_MYSQL_SOURCE=boost_1_59_0 +# leveldb +LEVELDB_DOWNLOAD="https://github.com/google/leveldb/archive/v1.20.tar.gz" +LEVELDB_NAME=leveldb-1.20.tar.gz +LEVELDB_SOURCE=leveldb-1.20 + +# brpc +BRPC_DOWNLOAD="https://github.com/brpc/brpc/archive/v0.9.0.tar.gz" +BRPC_NAME=brpc-0.9.0.tar.gz +BRPC_SOURCE=brpc-0.9.0 + # all thirdparties which need to be downloaded is set in array TP_ARCHIVES -export TP_ARCHIVES=(LIBEVENT OPENSSL THRIFT LLVM CLANG COMPILER_RT PROTOBUF GFLAGS GLOG GTEST RAPIDJSON SNAPPY GPERFTOOLS ZLIB LZ4 BZIP LZO2 NCURSES CURL RE2 BOOST MYSQL BOOST_FOR_MYSQL) +export TP_ARCHIVES=(LIBEVENT OPENSSL THRIFT LLVM CLANG COMPILER_RT PROTOBUF GFLAGS GLOG GTEST RAPIDJSON SNAPPY GPERFTOOLS ZLIB LZ4 BZIP LZO2 NCURSES CURL RE2 BOOST MYSQL BOOST_FOR_MYSQL LEVELDB BRPC)